--- a/ChengsongTanPhdThesis/main.tex Sat May 28 01:04:56 2022 +0100
+++ b/ChengsongTanPhdThesis/main.tex Sat May 28 16:29:32 2022 +0100
@@ -211,24 +211,23 @@
Theoretical results say that regular expression matching
should be linear with respect to the input.
Under a certain class of regular expressions and inputs though,
-practical implementations suffer from non-linear or even
+practical implementations often suffer from non-linear or even
exponential running time,
-allowing a ReDoS (regular expression denial-of-service ) attack.
-
+allowing ReDoS (regular expression denial-of-service) attacks.
+The reason behind this is that regex libraries in popular programming
+languages often want to support richer constructs
+than the usual basic regular expressions. Unfortunately, this means that even simple cases
+are "infected" with atrocious running time.
+
-The reason behind is that regex libraries in popular language engines
- often want to support richer constructs
-than the most basic regular expressions, and lexing rather than matching
-is needed for sub-match extraction.
-
-This work aims to address the above vulnerability by the combination
+This work aims to address the above vulnerability by a combination
of Brzozowski's derivatives and interactive theorem proving. We give an
improved version of Sulzmann and Lu's bit-coded algorithm using
-derivatives, which come with a formal guarantee in terms of correctness and
-running time as an Isabelle/HOL proof.
-Then we improve the algorithm with an even stronger version of
+derivatives, and give formal guarantees about its correctness and the
+size of derivatives, which we formalised in Isabelle/HOL.
+Then we improve the algorithm with a stronger version of
simplification, and prove a time bound linear to input and
-cubic to regular expression size using a technique by
+cubic to regular expression size, using a technique introduced by
Antimirov.
The result is an algorithm with provable guarantees on
correctness and running time. We believe this is the first
--- a/thys2/blexer2.sc Sat May 28 01:04:56 2022 +0100
+++ b/thys2/blexer2.sc Sat May 28 16:29:32 2022 +0100
@@ -641,7 +641,8 @@
}
def rprint(r: Rexp) : Unit = println(shortRexpOutput(r))
-def rsprint(rs: List[Rexp]) = rs.foreach(r => println(shortRexpOutput(r)))
+def rsprint(rs: Iterable[Rexp]) = rs.foreach(r => println(shortRexpOutput(r)))
+
def aprint(a: ARexp) = println(shortRexpOutput(erase(a)))
def asprint(as: List[ARexp]) = as.foreach(a => println(shortRexpOutput(erase(a))))
@@ -715,25 +716,45 @@
}
}
-def attachCtx(r: ARexp, ctx: List[Rexp]) : List[Rexp] = {
- val res = breakIntoTerms(oneSimp(L(erase(r), ctx))).map(oneSimp)
- res
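+// Breaks a regular expression into a list of "terms": alternatives are
+// flattened, a sequence SEQ(r1, r2) becomes the terms of r1 each sequenced
+// with r2, plus the terms of r2 when r1 is nullable (as r1 may match the
+// empty string); ZERO contributes no terms, anything else is a single term.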
+def strongBreakIntoTerms(r: Rexp): List[Rexp] = r match {
+ case SEQ(r1, r2) => if(nullable(r1))
+ strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2)) :::
+ strongBreakIntoTerms(r2)
+ else
+ strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2))
+ case ALTS(r1, r2) => strongBreakIntoTerms(r1) ::: strongBreakIntoTerms(r2)
+ case ZERO => Nil
+ case _ => r :: Nil
+}
+
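+// Erases the annotated regular expression r, wraps it in the context ctx via
+// L, breaks the result into terms, simplifies each term once with oneSimp and
+// returns them as a set.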
+def attachCtx(r: ARexp, ctx: List[Rexp]) : Set[Rexp] = {
+ val res = strongBreakIntoTerms(L(erase(r), ctx)).map(oneSimp)
+ res.toSet
}
def ABIncludedByC[A, B, C](a: A, b: B, c: C, f: (A, B) => C, inclusionPred: (C, C) => Boolean) : Boolean = {
+
inclusionPred(f(a, b), c)
}
-def rexpListInclusion(rs1: List[Rexp], rs2: List[Rexp]) : Boolean = {
- rs1.forall(rs2.contains)
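+// Checks that every term in rs1 already occurs in rs2 (the name is kept from
+// the list-based version, but it now operates on sets).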
+def rexpListInclusion(rs1: Set[Rexp], rs2: Set[Rexp]) : Boolean = {
+ // println("r+ctx---------")
+ // rsprint(rs1)
+ // println("acc---------")
+ // rsprint(rs2)
+
+ val res = rs1.forall(rs2.contains)
+ // println(res)
+ // println("end------------------")
+ res
}
-def pAKC6(acc: List[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = {
- // println("pakc")
+def pAKC6(acc: Set[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = {
+ // println("pakc--------r")
// println(shortRexpOutput(erase(r)))
- // println("acc")
+ // println("ctx---------")
+ // rsprint(ctx)
+ // println("pakc-------acc")
// rsprint(acc)
- // println("ctx---------")
- // rsprint(ctx)
- // println("ctx---------end")
+ // println("r+ctx broken down---------")
// rsprint(breakIntoTerms(L(erase(r), ctx)).map(oneSimp))
// rprint(L(erase(r), ctx))
@@ -779,7 +800,7 @@
}
-def distinctBy6(xs: List[ARexp], acc: List[Rexp] = Nil) : List[ARexp] = xs match {
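+// Removes "duplicate" annotated regular expressions from xs: the accumulator
+// acc (now a set) records the simplified terms seen so far, and each element
+// is first pruned against acc with pAKC6 before it is kept.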
+def distinctBy6(xs: List[ARexp], acc: Set[Rexp] = Set()) : List[ARexp] = xs match {
case Nil =>
Nil
case x :: xs => {
@@ -789,12 +810,12 @@
}
else{
val pruned = pAKC6(acc, x, Nil)
- val newTerms = breakIntoTerms(erase(pruned))
+ val newTerms = strongBreakIntoTerms(erase(pruned))
pruned match {
case AZERO =>
distinctBy6(xs, acc)
case xPrime =>
- xPrime :: distinctBy6(xs, newTerms.map(oneSimp) ::: acc)//distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc)
+ xPrime :: distinctBy6(xs, acc ++ newTerms.map(oneSimp)) //distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc)
}
}
}
@@ -1013,7 +1034,14 @@
def cir_prod(l: Lin, t: Rexp): Lin = t match {//remember this Lin is different from the Lin in Antimirov's paper. Here it does not mean the set of all subsets of linear forms that does not contain zero, but rather the type a set of linear forms
case ZERO => Set()
case ONE => l
- case t => l.map( m => m._2 match {case ZERO => m case ONE => (m._1, t) case p => (m._1, SEQ(p, t)) } )
+ case t => l.map( m => m._2 match {
+   case ZERO => m
+   case ONE => (m._1, t)
+   case p => (m._1, SEQ(p, t))
+ } )
}
def lf(r: Rexp): Lin = r match {
case ZERO => Set()
@@ -1049,12 +1077,17 @@
case x::xs => pders(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t)))
case Nil => ts
}
- def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc )
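+ // Takes the union of pders_single(s, t) over all strings s in ss.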
+ def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] =
+ ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc )
def pdera(t: Rexp): Set[Rexp] = lf(t).map(mon => mon._2)
//all implementation of partial derivatives that involve set union are potentially buggy
//because they don't include the original regular term before they are pdered.
//now only pderas is fixed.
- def pderas(t: Set[Rexp], d: Int): Set[Rexp] = if(d > 0) pderas(lfs(t).map(mon => mon._2), d - 1) ++ t else lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders.
+ def pderas(t: Set[Rexp], d: Int): Set[Rexp] =
+ if(d > 0)
+ pderas(lfs(t).map(mon => mon._2), d - 1) ++ t
+ else
+ lfs(t).map(mon => mon._2) ++ t // repeated application of pderas over the newest set of pders.
def pderUNIV(r: Rexp) : Set[Rexp] = pderas(Set(r), awidth(r) + 1)
def awidth(r: Rexp) : Int = r match {
case CHAR(c) => 1
@@ -1223,12 +1256,11 @@
}
// small()
-generator_test()
+// generator_test()
def counterexample_check() {
- val r = STAR(SEQ(ALTS(ALTS(CHAR('b'),CHAR('c')),
- SEQ(CHAR('b'),CHAR('b'))),ALTS(SEQ(ONE,CHAR('b')),CHAR('a'))))
- val s = "bbbb"
+ val r = SEQ(STAR(CHAR('c')),STAR(SEQ(STAR(CHAR('c')),ONE))) // STAR(SEQ(ALTS(STAR(CHAR('c')),CHAR('c')),SEQ(ALTS(CHAR('c'),ONE),ONE)))
+ val s = "ccc"
val bdStrong5 = bdersStrong6(s.toList, internalise(r))
val bdStrong5Set = breakIntoTerms(erase(bdStrong5))
val pdersSet = pderUNIV(r)//.map(oneSimp).flatMap(r => breakIntoTerms(r))
@@ -1242,6 +1274,12 @@
rsprint(pdersSet.toList)
}
// counterexample_check()
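+// Small sanity check: prints how many linear forms lf computes for
+// STAR(SEQ(STAR(CHAR('c')), ONE)).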
+def linform_test() {
+ val r = STAR(SEQ(STAR(CHAR('c')), ONE))
+ val r_linforms = lf(r)
+ println(r_linforms.size)
+}
+linform_test()
// 1
def newStrong_test() {
val r2 = (CHAR('b') | ONE)