# HG changeset patch # User Chengsong # Date 1653751772 -3600 # Node ID cb702fb4227f74ea3cbd1df5a0cfeb727fea2bcd # Parent d8740017324c1dbbed2ef28b699ca223f1cf86d5 updated diff -r d8740017324c -r cb702fb4227f ChengsongTanPhdThesis/Chapters/Chapter2.tex --- a/ChengsongTanPhdThesis/Chapters/Chapter2.tex Sat May 28 01:04:56 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Chapter2.tex Sat May 28 16:29:32 2022 +0100 @@ -1,6 +1,6 @@ % Chapter Template -\chapter{Regular Expressions and Sulzmanna and Lu's Lexing Algorithm Without Bitcodes} % Main chapter title +\chapter{Regular Expressions and Sulzmann and Lu's Lexing Algorithm Without Bitcodes} % Main chapter title \label{Chapter2} % In chapter 2 \ref{Chapter2} we will introduce the concepts %and notations we diff -r d8740017324c -r cb702fb4227f ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Sat May 28 01:04:56 2022 +0100 +++ b/ChengsongTanPhdThesis/main.tex Sat May 28 16:29:32 2022 +0100 @@ -211,24 +211,23 @@ Theoretical results say that regular expression matching should be linear with respect to the input. Under a certain class of regular expressions and inputs though, -practical implementations suffer from non-linear or even +practical implementations often suffer from non-linear or even exponential running time, -allowing a ReDoS (regular expression denial-of-service ) attack. - +allowing ReDoS (regular expression denial-of-service ) attacks. +The reason behind this is that regex libraries in popular programming +languages often want to support richer constructs +than the usual basic regular expressions. Unfortunately, this means that even simple cases +are "infected" with atrocious running time. + -The reason behind is that regex libraries in popular language engines - often want to support richer constructs -than the most basic regular expressions, and lexing rather than matching -is needed for sub-match extraction. - -This work aims to address the above vulnerability by the combination +This work aims to address the above vulnerability by a combination of Brzozowski's derivatives and interactive theorem proving. We give an improved version of Sulzmann and Lu's bit-coded algorithm using -derivatives, which come with a formal guarantee in terms of correctness and -running time as an Isabelle/HOL proof. -Then we improve the algorithm with an even stronger version of +derivatives, and give a formal guarantee in terms of correctness and +size of derivatives, which we formalised in Isabelle/HOL. +Then we improve the algorithm with a stronger version of simplification, and prove a time bound linear to input and -cubic to regular expression size using a technique by +cubic to regular expression size using a technique introduced by Antimirov. The result is an algorithm with provable guarantees on correctness and running time. We believe this is the first diff -r d8740017324c -r cb702fb4227f thys2/blexer2.sc --- a/thys2/blexer2.sc Sat May 28 01:04:56 2022 +0100 +++ b/thys2/blexer2.sc Sat May 28 16:29:32 2022 +0100 @@ -641,7 +641,8 @@ } def rprint(r: Rexp) : Unit = println(shortRexpOutput(r)) -def rsprint(rs: List[Rexp]) = rs.foreach(r => println(shortRexpOutput(r))) +def rsprint(rs: Iterable[Rexp]) = rs.foreach(r => println(shortRexpOutput(r))) + def aprint(a: ARexp) = println(shortRexpOutput(erase(a))) def asprint(as: List[ARexp]) = as.foreach(a => println(shortRexpOutput(erase(a)))) @@ -715,25 +716,45 @@ } } -def attachCtx(r: ARexp, ctx: List[Rexp]) : List[Rexp] = { - val res = breakIntoTerms(oneSimp(L(erase(r), ctx))).map(oneSimp) - res +def strongBreakIntoTerms(r: Rexp): List[Rexp] = r match { + case SEQ(r1, r2) => if(nullable(r1)) + strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2)) ::: + strongBreakIntoTerms(r2) + else + strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2)) + case ALTS(r1, r2) => strongBreakIntoTerms(r1) ::: strongBreakIntoTerms(r2) + case ZERO => Nil + case _ => r :: Nil +} + +def attachCtx(r: ARexp, ctx: List[Rexp]) : Set[Rexp] = { + val res = strongBreakIntoTerms((L(erase(r), ctx))).map(oneSimp) + res.toSet } def ABIncludedByC[A, B, C](a: A, b: B, c: C, f: (A, B) => C, inclusionPred: (C, C) => Boolean) : Boolean = { + inclusionPred(f(a, b), c) } -def rexpListInclusion(rs1: List[Rexp], rs2: List[Rexp]) : Boolean = { - rs1.forall(rs2.contains) +def rexpListInclusion(rs1: Set[Rexp], rs2: Set[Rexp]) : Boolean = { + // println("r+ctx---------") + // rsprint(rs1) + // println("acc---------") + // rsprint(rs2) + + val res = rs1.forall(rs2.contains) + // println(res) + // println("end------------------") + res } -def pAKC6(acc: List[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = { - // println("pakc") +def pAKC6(acc: Set[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = { + // println("pakc--------r") // println(shortRexpOutput(erase(r))) - // println("acc") + // println("ctx---------") + // rsprint(ctx) + // println("pakc-------acc") // rsprint(acc) - // println("ctx---------") - // rsprint(ctx) - // println("ctx---------end") + // println("r+ctx broken down---------") // rsprint(breakIntoTerms(L(erase(r), ctx)).map(oneSimp)) // rprint(L(erase(r), ctx)) @@ -779,7 +800,7 @@ } -def distinctBy6(xs: List[ARexp], acc: List[Rexp] = Nil) : List[ARexp] = xs match { +def distinctBy6(xs: List[ARexp], acc: Set[Rexp] = Set()) : List[ARexp] = xs match { case Nil => Nil case x :: xs => { @@ -789,12 +810,12 @@ } else{ val pruned = pAKC6(acc, x, Nil) - val newTerms = breakIntoTerms(erase(pruned)) + val newTerms = strongBreakIntoTerms(erase(pruned)) pruned match { case AZERO => distinctBy6(xs, acc) case xPrime => - xPrime :: distinctBy6(xs, newTerms.map(oneSimp) ::: acc)//distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc) + xPrime :: distinctBy6(xs, newTerms.map(oneSimp) ++: acc)//distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc) } } } @@ -1013,7 +1034,14 @@ def cir_prod(l: Lin, t: Rexp): Lin = t match {//remember this Lin is different from the Lin in Antimirov's paper. Here it does not mean the set of all subsets of linear forms that does not contain zero, but rather the type a set of linear forms case ZERO => Set() case ONE => l - case t => l.map( m => m._2 match {case ZERO => m case ONE => (m._1, t) case p => (m._1, SEQ(p, t)) } ) + case t => l.map( m => m._2 match + { + case ZERO => m + case ONE => (m._1, t) + case p => (m._1, SEQ(p, t)) + } + + ) } def lf(r: Rexp): Lin = r match { case ZERO => Set() @@ -1049,12 +1077,17 @@ case x::xs => pders(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t))) case Nil => ts } - def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc ) + def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = + ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc ) def pdera(t: Rexp): Set[Rexp] = lf(t).map(mon => mon._2) //all implementation of partial derivatives that involve set union are potentially buggy //because they don't include the original regular term before they are pdered. //now only pderas is fixed. - def pderas(t: Set[Rexp], d: Int): Set[Rexp] = if(d > 0) pderas(lfs(t).map(mon => mon._2), d - 1) ++ t else lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders. + def pderas(t: Set[Rexp], d: Int): Set[Rexp] = + if(d > 0) + pderas(lfs(t).map(mon => mon._2), d - 1) ++ t + else + lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders. def pderUNIV(r: Rexp) : Set[Rexp] = pderas(Set(r), awidth(r) + 1) def awidth(r: Rexp) : Int = r match { case CHAR(c) => 1 @@ -1223,12 +1256,11 @@ } // small() -generator_test() +// generator_test() def counterexample_check() { - val r = STAR(SEQ(ALTS(ALTS(CHAR('b'),CHAR('c')), - SEQ(CHAR('b'),CHAR('b'))),ALTS(SEQ(ONE,CHAR('b')),CHAR('a')))) - val s = "bbbb" + val r = SEQ(STAR(CHAR('c')),STAR(SEQ(STAR(CHAR('c')),ONE)))//STAR(SEQ(ALTS(STAR(CHAR('c')),CHAR('c')),SEQ(ALTS(CHAR('c'),ONE),ONE))) + val s = "ccc" val bdStrong5 = bdersStrong6(s.toList, internalise(r)) val bdStrong5Set = breakIntoTerms(erase(bdStrong5)) val pdersSet = pderUNIV(r)//.map(oneSimp).flatMap(r => breakIntoTerms(r)) @@ -1242,6 +1274,12 @@ rsprint(pdersSet.toList) } // counterexample_check() +def linform_test() { + val r = STAR(SEQ(STAR(CHAR('c')), ONE)) + val r_linforms = lf(r) + println(r_linforms.size) +} +linform_test() // 1 def newStrong_test() { val r2 = (CHAR('b') | ONE)