updated
authorChengsong
Sat, 28 May 2022 16:29:32 +0100
changeset 526 cb702fb4227f
parent 525 d8740017324c
child 527 2c907b118f78
updated
ChengsongTanPhdThesis/Chapters/Chapter2.tex
ChengsongTanPhdThesis/main.tex
thys2/blexer2.sc
--- a/ChengsongTanPhdThesis/Chapters/Chapter2.tex	Sat May 28 01:04:56 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Chapter2.tex	Sat May 28 16:29:32 2022 +0100
@@ -1,6 +1,6 @@
 % Chapter Template
 
-\chapter{Regular Expressions and Sulzmanna and Lu's Lexing Algorithm Without Bitcodes} % Main chapter title
+\chapter{Regular Expressions and Sulzmann and Lu's Lexing Algorithm Without Bitcodes} % Main chapter title
 
 \label{Chapter2} % In chapter 2 \ref{Chapter2} we will introduce the concepts
 %and notations we 
--- a/ChengsongTanPhdThesis/main.tex	Sat May 28 01:04:56 2022 +0100
+++ b/ChengsongTanPhdThesis/main.tex	Sat May 28 16:29:32 2022 +0100
@@ -211,24 +211,23 @@
 Theoretical results say that regular expression matching
 should be linear with respect to the input.
 Under a certain class of regular expressions and inputs though,
-practical implementations  suffer from non-linear or even 
+practical implementations often suffer from non-linear or even 
 exponential running time,
-allowing a ReDoS (regular expression denial-of-service ) attack.
-
+allowing ReDoS (regular expression denial-of-service ) attacks.
+The reason behind this is that regex libraries in popular programming
+languages often want to support richer constructs
+than the usual basic regular expressions. Unfortunately, this means that even simple cases
+are "infected" with atrocious running time.
+ 
 
-The reason behind is that regex libraries in popular language engines
- often want to support richer constructs
-than  the most basic regular expressions, and lexing rather than matching
-is needed for sub-match extraction.
-
-This work aims to address the above vulnerability by the combination
+This work aims to address the above vulnerability by a combination
 of Brzozowski's derivatives and interactive theorem proving. We give an 
 improved version of  Sulzmann and Lu's bit-coded algorithm using 
-derivatives, which come with a formal guarantee in terms of correctness and 
-running time as an Isabelle/HOL proof.
-Then we improve the algorithm with an even stronger version of 
+derivatives, and give a formal guarantee in terms of correctness and 
+size of derivatives, which we formalised in Isabelle/HOL.
+Then we improve the algorithm with a stronger version of 
 simplification, and prove a time bound linear to input and
-cubic to regular expression size using a technique by
+cubic to regular expression size using a technique introduced by
 Antimirov.
 The result is an algorithm with provable guarantees on 
 correctness and running time. We believe this is the first 
--- a/thys2/blexer2.sc	Sat May 28 01:04:56 2022 +0100
+++ b/thys2/blexer2.sc	Sat May 28 16:29:32 2022 +0100
@@ -641,7 +641,8 @@
 }
 
 def rprint(r: Rexp) : Unit = println(shortRexpOutput(r))
-def rsprint(rs: List[Rexp]) = rs.foreach(r => println(shortRexpOutput(r)))
+def rsprint(rs: Iterable[Rexp]) = rs.foreach(r => println(shortRexpOutput(r)))
+
 def aprint(a: ARexp) = println(shortRexpOutput(erase(a)))
 def asprint(as: List[ARexp]) = as.foreach(a => println(shortRexpOutput(erase(a))))
 
@@ -715,25 +716,45 @@
   }
 }
 
-def attachCtx(r: ARexp, ctx: List[Rexp]) : List[Rexp] = {
-  val res = breakIntoTerms(oneSimp(L(erase(r), ctx))).map(oneSimp)
-  res
+def strongBreakIntoTerms(r: Rexp): List[Rexp] = r match {
+  case SEQ(r1, r2)  => if(nullable(r1)) 
+                          strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2)) ::: 
+                          strongBreakIntoTerms(r2) 
+                       else 
+                          strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2))
+  case ALTS(r1, r2) => strongBreakIntoTerms(r1) ::: strongBreakIntoTerms(r2)
+  case ZERO => Nil
+  case _ => r :: Nil
+}
+
+def attachCtx(r: ARexp, ctx: List[Rexp]) : Set[Rexp] = {
+  val res = strongBreakIntoTerms((L(erase(r), ctx))).map(oneSimp)
+  res.toSet
 }
 
 def ABIncludedByC[A, B, C](a: A, b: B, c: C, f: (A, B) => C, inclusionPred: (C, C) => Boolean) : Boolean = {
+  
   inclusionPred(f(a, b), c)
 }
-def rexpListInclusion(rs1: List[Rexp], rs2: List[Rexp]) : Boolean = {
-  rs1.forall(rs2.contains)
+def rexpListInclusion(rs1: Set[Rexp], rs2: Set[Rexp]) : Boolean = {
+  // println("r+ctx---------")
+  // rsprint(rs1)
+  // println("acc---------")
+  // rsprint(rs2)
+  
+  val res = rs1.forall(rs2.contains)
+  // println(res)
+  // println("end------------------")
+  res
 }
-def pAKC6(acc: List[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = {
-  // println("pakc")
+def pAKC6(acc: Set[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = {
+  // println("pakc--------r")
   // println(shortRexpOutput(erase(r)))
-  // println("acc")
+  //   println("ctx---------")
+  // rsprint(ctx)
+  // println("pakc-------acc")
   // rsprint(acc)
-  // println("ctx---------")
-  // rsprint(ctx)
-  // println("ctx---------end")
+  // println("r+ctx broken down---------")
   // rsprint(breakIntoTerms(L(erase(r), ctx)).map(oneSimp))
 
   // rprint(L(erase(r), ctx))
@@ -779,7 +800,7 @@
 }
 
 
-def distinctBy6(xs: List[ARexp], acc: List[Rexp] = Nil) : List[ARexp] = xs match {
+def distinctBy6(xs: List[ARexp], acc: Set[Rexp] = Set()) : List[ARexp] = xs match {
   case Nil => 
     Nil
   case x :: xs => {
@@ -789,12 +810,12 @@
     }
     else{
       val pruned = pAKC6(acc, x, Nil)
-      val newTerms = breakIntoTerms(erase(pruned))
+      val newTerms = strongBreakIntoTerms(erase(pruned))
       pruned match {
         case AZERO => 
           distinctBy6(xs, acc)
         case xPrime => 
-          xPrime :: distinctBy6(xs, newTerms.map(oneSimp) ::: acc)//distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc)
+          xPrime :: distinctBy6(xs, newTerms.map(oneSimp) ++: acc)//distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc)
       }
     }
   }
@@ -1013,7 +1034,14 @@
   def cir_prod(l: Lin, t: Rexp): Lin = t match {//remember this Lin is different from the Lin in Antimirov's paper. Here it does not mean the set of all subsets of linear forms that does not contain zero, but rather the type a set of linear forms
     case ZERO => Set()
     case ONE => l
-    case t => l.map( m => m._2 match {case ZERO => m case ONE => (m._1, t) case p => (m._1, SEQ(p, t)) }  )
+    case t => l.map( m => m._2 match 
+      {
+        case ZERO => m 
+        case ONE => (m._1, t) 
+        case p => (m._1, SEQ(p, t)) 
+      }  
+    
+    )
   }
   def lf(r: Rexp): Lin = r match {
     case ZERO => Set()
@@ -1049,12 +1077,17 @@
     case x::xs => pders(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t)))
     case Nil => ts 
   }
-  def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc )
+  def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = 
+    ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc )
   def pdera(t: Rexp): Set[Rexp] = lf(t).map(mon => mon._2)
   //all implementation of partial derivatives that involve set union are potentially buggy
   //because they don't include the original regular term before they are pdered.
   //now only pderas is fixed.
-  def pderas(t: Set[Rexp], d: Int): Set[Rexp] = if(d > 0) pderas(lfs(t).map(mon => mon._2), d - 1) ++ t else lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders.
+  def pderas(t: Set[Rexp], d: Int): Set[Rexp] = 
+    if(d > 0) 
+      pderas(lfs(t).map(mon => mon._2), d - 1) ++ t 
+    else 
+      lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders.
   def pderUNIV(r: Rexp) : Set[Rexp] = pderas(Set(r), awidth(r) + 1)
   def awidth(r: Rexp) : Int = r match {
     case CHAR(c) => 1
@@ -1223,12 +1256,11 @@
 
 }
 // small()
-generator_test()
+// generator_test()
 
 def counterexample_check() {
-  val r = STAR(SEQ(ALTS(ALTS(CHAR('b'),CHAR('c')),
-    SEQ(CHAR('b'),CHAR('b'))),ALTS(SEQ(ONE,CHAR('b')),CHAR('a'))))
-  val s = "bbbb"
+  val r = SEQ(STAR(CHAR('c')),STAR(SEQ(STAR(CHAR('c')),ONE)))//STAR(SEQ(ALTS(STAR(CHAR('c')),CHAR('c')),SEQ(ALTS(CHAR('c'),ONE),ONE)))
+  val s = "ccc"
   val bdStrong5 = bdersStrong6(s.toList, internalise(r))
   val bdStrong5Set = breakIntoTerms(erase(bdStrong5))
   val pdersSet = pderUNIV(r)//.map(oneSimp).flatMap(r => breakIntoTerms(r))
@@ -1242,6 +1274,12 @@
   rsprint(pdersSet.toList)
 }
 // counterexample_check()
+def linform_test() {
+  val r = STAR(SEQ(STAR(CHAR('c')), ONE))
+  val r_linforms = lf(r)
+  println(r_linforms.size)
+}
+linform_test()
 // 1
 def newStrong_test() {
   val r2 = (CHAR('b') | ONE)