main_testing3/re.scala
changeset 463 0315d9983cd0
parent 457 9cf317975ae7
child 475 59e005dcf163
equal deleted inserted replaced
462:34feeb53c0ba 463:0315d9983cd0
     1 // Main Part 3 about Regular Expression Matching
     1 // Main Part 3 about Regular Expression Matching
     2 //==============================================
     2 //==============================================
     3 
     3 
     4 object M3 {
     4 object M3 {
     5 
     5 
     6 // Regular Expressions
       
     7 abstract class Rexp
     6 abstract class Rexp
     8 case object ZERO extends Rexp
     7 case object ZERO extends Rexp
     9 case object ONE extends Rexp
     8 case object ONE extends Rexp
    10 case class CHAR(c: Char) extends Rexp
     9 case class CHAR(c: Char) extends Rexp
    11 case class ALTs(rs: List[Rexp]) extends Rexp  // alternatives 
    10 case class ALTs(rs: List[Rexp]) extends Rexp  // alternatives 
    16 //the usual binary choice and binary sequence can be defined 
    15 //the usual binary choice and binary sequence can be defined 
    17 //in terms of ALTs and SEQs
    16 //in terms of ALTs and SEQs
    18 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2))
    17 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2))
    19 def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2))
    18 def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2))
    20 
    19 
    21 // some convenience for typing in regular expressions
    20 
       
    21 // some convenience for typing regular expressions
    22 import scala.language.implicitConversions    
    22 import scala.language.implicitConversions    
    23 import scala.language.reflectiveCalls 
    23 import scala.language.reflectiveCalls 
    24 
    24 
    25 def charlist2rexp(s: List[Char]): Rexp = s match {
    25 def charlist2rexp(s: List[Char]): Rexp = s match {
    26   case Nil => ONE
    26   case Nil => ONE
    41   def % = STAR(s)
    41   def % = STAR(s)
    42   def ~ (r: Rexp) = SEQ(s, r)
    42   def ~ (r: Rexp) = SEQ(s, r)
    43   def ~ (r: String) = SEQ(s, r)
    43   def ~ (r: String) = SEQ(s, r)
    44 }
    44 }
    45 
    45 
    46 // (1) 
    46 // examples for the implicits:
       
    47 // ALT(CHAR('a'), CHAR('b'))
       
    48 // val areg : Rexp = "a" | "b"
       
    49 
       
    50 // SEQ(CHAR('a'), CHAR('b')) 
       
    51 // val sreg : Rexp = "a" ~ "b"
       
    52 
       
    53 
       
    54 // ADD YOUR CODE BELOW
       
    55 //======================
       
    56 
       
    57 // (1)
    47 def nullable (r: Rexp) : Boolean = r match {
    58 def nullable (r: Rexp) : Boolean = r match {
    48   case ZERO => false
    59   case ZERO     => false
    49   case ONE => true
    60   case ONE      => true
    50   case CHAR(_) => false
    61   case CHAR(_)  => false
    51   case ALTs(rs) => rs.exists(nullable)
    62   case ALTs(rs) => (for(reg <- rs) yield nullable(reg)).exists(_ == true)
    52   case SEQs(rs) => rs.forall(nullable)
    63   case SEQs(rs) => (for(reg <- rs) yield nullable(reg)).forall(_ == true)
    53   case STAR(_) => true
    64   case STAR(_)  => true
    54 }
    65 }
       
    66 
       
    67 /*
       
    68 nullable(ZERO) == false
       
    69 nullable(ONE) == true
       
    70 nullable(CHAR('a')) == false
       
    71 nullable(ZERO | ONE) == true
       
    72 nullable(ZERO | CHAR('a')) == false
       
    73 nullable(ONE ~ ONE) == true
       
    74 nullable(ONE ~ CHAR('a')) == false
       
    75 nullable(STAR(ZERO)) == true
       
    76 nullable(ALTs(List(ONE, CHAR('a'), ZERO))) == true
       
    77 nullable(SEQs(List(ONE, ALTs(List(ONE, CHAR('a'), ZERO)), STAR(ZERO)))) == true
       
    78 */
    55 
    79 
    56 // (2) 
    80 // (2) 
    57 def der(c: Char, r: Rexp) : Rexp = r match {
    81 def der (c: Char, r: Rexp) : Rexp = r match {
    58   case ZERO => ZERO
    82   case ZERO           => ZERO
    59   case ONE => ZERO
    83   case ONE            => ZERO
    60   case CHAR(d) => if (c == d) ONE else ZERO
    84   case CHAR(d)        => if(c == d) ONE else ZERO
    61   case ALTs(rs) => ALTs(rs.map(der(c, _)))
    85   case ALTs(rs)       => ALTs(for(reg <- rs) yield der(c, reg))
    62   case SEQs(Nil) => ZERO
    86   case SEQs(Nil)      => ZERO
    63   case SEQs(r1::rs) => 
    87   case SEQs(r :: rs)  => if(nullable(r)) ALT(SEQs(der(c, r) :: rs), der(c, SEQs(rs))) else SEQs(der(c, r) :: rs)
    64     if (nullable(r1)) ALT(SEQs(der(c, r1)::rs), der(c, SEQs(rs)))
    88   case STAR(r)        => SEQ(der(c,r), STAR(r))
    65     else SEQs(der(c, r1):: rs)
    89 }
    66   case STAR(r1) => SEQ(der(c, r1), STAR(r1))
    90 
    67 }
    91 /*
    68 
    92 der('a', ZERO | ONE) == (ZERO | ZERO)
       
    93 der('a', (CHAR('a') | ONE) ~ CHAR('a')) == ALT((ONE | ZERO) ~ CHAR('a'), SEQs(List(ONE)))
       
    94 der('a', (CHAR('a') | CHAR('a')) ~ CHAR('a')) == (ONE | ONE) ~ CHAR('a')
       
    95 der('a', STAR(CHAR('a'))) == (ONE ~ STAR(CHAR('a')))
       
    96 der('b', STAR(CHAR('a'))) == (ZERO ~ STAR(CHAR('a')))
       
    97 */
    69 
    98 
    70 // (3) 
    99 // (3) 
    71 def denest(rs: List[Rexp]) : List[Rexp] = rs match {
   100 def denest(rs: List[Rexp]) : List[Rexp] = rs match {
    72   case Nil => Nil
   101   case Nil                => Nil
    73   case ZERO::tl => denest(tl)
   102   case ZERO :: rest       => denest(rest)
    74   case ALTs(rs1)::rs2 => rs1 ::: denest(rs2)  
   103   case ALTs(rgs) :: rest  => rgs ::: denest(rest)
    75   case r::rs => r :: denest(rs) 
   104   case r :: rest          => r :: denest(rest)
    76 }
   105 }
       
   106 
       
   107 /*
       
   108 denest(List(ONE, ZERO, ALTs(List(ONE, CHAR('a'))))) == List(ONE, ONE, CHAR('a'))
       
   109 denest(List(ONE ~ ONE, ZERO, ZERO | ONE)) == List(ONE ~ ONE, ZERO, ONE)
       
   110 */
    77 
   111 
    78 // (4)
   112 // (4)
    79 def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match {
   113 def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match {
    80   case Nil => acc
   114   case Nil                => acc
    81   case ZERO::rs => ZERO::Nil
   115   case ZERO :: rest       => List(ZERO)
    82   case ONE::rs => flts(rs, acc)
   116   case ONE :: rest        => flts(rest, acc)
    83   case SEQs(rs1)::rs => flts(rs, acc ::: rs1)
   117   case SEQs(rgs) :: rest  => flts(rest, acc ::: rgs)
    84   case r::rs => flts(rs, acc :+ r) 
   118   case r :: rest          => flts(rest, acc ::: List(r)) 
    85 }
   119 }
       
   120 
       
   121 /*
       
   122 flts(List(CHAR('a'), ZERO, ONE), Nil) == List(ZERO)
       
   123 flts(List(CHAR('a'), ONE, ONE, CHAR('b')), Nil) == List(CHAR('a'), CHAR('b'))
       
   124 flts(List(ONE ~ CHAR('a'), CHAR('b') ~ ONE), Nil) == List(ONE, CHAR('a'), CHAR('b'), ONE)
       
   125 */
    86 
   126 
    87 // (5)
   127 // (5)
    88 def ALTs_smart(rs: List[Rexp]) : Rexp = rs match {
   128 def ALTs_smart(rs: List[Rexp]) : Rexp = rs match {
    89   case Nil => ZERO
   129   case Nil      => ZERO
    90   case r::Nil => r  
   130   case List(r)  => r
    91   case rs => ALTs(rs)
   131   case _        => ALTs(rs)
    92 }
   132 }
    93 
   133 
    94 def SEQs_smart(rs: List[Rexp]) : Rexp = rs match {
   134 def SEQs_smart(rs: List[Rexp]) : Rexp = rs match {
    95   case Nil => ONE
   135   case Nil      => ONE
    96   case ZERO::Nil => ZERO
   136   case List(r)  => r
    97   case r::Nil => r
   137   case _        => SEQs(rs)
    98   case rs => SEQs(rs) 
   138 }
    99 }
   139 
   100 
   140 /*
   101 // (6) 
   141 SEQs_smart(Nil) == ONE
   102 
   142 SEQs_smart(List(ZERO)) == ZERO
       
   143 SEQs_smart(List(CHAR('a'))) == CHAR('a')
       
   144 SEQs_smart(List(ONE ~ ONE)) == ONE ~ ONE
       
   145 SEQs_smart(List(ONE, ONE)) == SEQs(List(ONE, ONE))
       
   146 ALTs_smart(Nil) == ZERO
       
   147 ALTs_smart(List(ONE ~ ONE)) == ONE ~ ONE
       
   148 ALTs_smart(List(ZERO, ZERO)) == ALTs(List(ZERO, ZERO))
       
   149 */
       
   150 
       
   151 // (6)
   103 def simp(r: Rexp) : Rexp = r match {
   152 def simp(r: Rexp) : Rexp = r match {
   104   case ALTs(rs) => 
   153   case ALTs(rs) => ALTs_smart(denest(for(reg <- rs) yield simp(reg)).distinct)
   105     ALTs_smart(denest(rs.map(simp)).distinct)
   154   case SEQs(rs) => SEQs_smart(flts(for(reg <- rs) yield simp(reg)))
   106   case SEQs(rs) => 
   155   case _        => r
   107     SEQs_smart(flts(rs.map(simp)))
   156 }
   108   case r => r
   157 
   109 }
   158 /*
   110 
   159 simp(ZERO | ONE) == ONE
   111 //println("Simp tests")
   160 simp(STAR(ZERO | ONE)) == STAR(ZERO | ONE)
   112 //println(simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)))
   161 simp(ONE ~ (ONE ~ (ONE ~ CHAR('a')))) == CHAR('a')
   113 //println(simp(((CHAR('a') | ZERO) ~ ONE) | 
   162 simp(((ONE ~ ONE) ~ ONE) ~ CHAR('a')) == CHAR('a')
   114 //              (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))))
   163 simp(((ONE | ONE) ~ ONE) ~ CHAR('a')) == CHAR('a')
   115 
   164 simp(ONE ~ (ONE ~ (ONE ~ ZERO))) == ZERO
   116 
   165 simp(ALT(ONE ~ (ONE ~ (ONE ~ ZERO)), CHAR('a'))) == CHAR('a')
   117 // (7) 
   166 simp(CHAR('a') | CHAR('a')) == CHAR('a')
   118 
   167 simp(CHAR('a') ~ CHAR('a')) == CHAR('a') ~ CHAR('a')
       
   168 simp(ONE | CHAR('a')) == (ONE | CHAR('a'))
       
   169 simp(ALT((CHAR('a') | ZERO) ~ ONE,((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))) == CHAR('a')
       
   170 simp((ZERO | ((ZERO | ZERO) | (ZERO | ZERO))) ~ ((ONE | ZERO) | ONE ) ~ (CHAR('a'))) == ZERO
       
   171 simp(ALT(ONE | ONE, ONE | ONE)) == ONE
       
   172 simp(ALT(ZERO | CHAR('a'), CHAR('a') | ZERO)) == CHAR('a')
       
   173 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)) == ALT(ONE, CHAR('a'))
       
   174 simp(ALTs(Nil)) == ZERO
       
   175 simp(SEQs(List(CHAR('a')))) == CHAR('a')
       
   176 */
       
   177 
       
   178 // (7)
   119 def ders (s: List[Char], r: Rexp) : Rexp = s match {
   179 def ders (s: List[Char], r: Rexp) : Rexp = s match {
   120   case Nil => r
   180   case Nil      => r
   121   case c::s => ders(s, simp(der(c, r)))
   181   case c :: cs  => ders(cs, simp(der(c, r)))
   122 }
   182 }
   123 
   183 def matcher(r: Rexp, s: String): Boolean = {
   124 // main matcher function
   184   val derivatives = ders(s.toList, r)
   125 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r))
   185   nullable(derivatives)
       
   186 }
       
   187 
       
   188 /*
       
   189 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
       
   190 ders("aaaaa".toList, EVIL) == SEQs(List(STAR(CHAR('a')), STAR(STAR(CHAR('a'))), CHAR('b')))
       
   191 ders(List('b'), EVIL) == ONE
       
   192 ders("bb".toList, EVIL) == ZERO
       
   193 matcher(EVIL, "a" * 5 ++ "b") == true
       
   194 matcher(EVIL, "b") == true
       
   195 matcher(EVIL, "bb") == false
       
   196 matcher("abc", "abc") == true
       
   197 matcher(("ab" | "a") ~ (ONE | "bc"), "abc") == true
       
   198 matcher(ONE, "") == true
       
   199 matcher(ZERO, "") == false
       
   200 matcher(ONE | CHAR('a'), "") == true
       
   201 matcher(ONE | CHAR('a'), "a") == true
       
   202 */
   126 
   203 
   127 // (8) 
   204 // (8) 
   128 
       
   129 def size(r: Rexp): Int = r match {
   205 def size(r: Rexp): Int = r match {
   130   case ZERO => 1
   206   case ZERO     => 1
   131   case ONE => 1
   207   case ONE      => 1
   132   case CHAR(_) => 1
   208   case CHAR(_)  => 1
   133   case ALTs(rs) => 1 + rs.map(size).sum
   209   case ALTs(rs) => 1 + (for(reg <- rs) yield size(reg)).sum
   134   case SEQs(rs) => 1 + rs.map(size).sum
   210   case SEQs(rs) => 1 + (for(reg <- rs) yield size(reg)).sum
   135   case STAR(r1) => 1 + size(r1)
   211   case STAR(r)  => 1 + size(r)
   136 }
   212 }
   137 
   213 
   138 
   214 /*
   139 
   215 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   140 // some testing data
   216 size(der('a', der('a', EVIL))) == 36
   141 /*
   217 size(der('a', der('a', der('a', EVIL)))) == 83
   142 println(matcher(("a" ~ "b") ~ "c", "abc"))  // => true
   218 size(ders("aaaaaa".toList, EVIL)) == 7
   143 println(matcher(("a" ~ "b") ~ "c", "ab"))   // => false
   219 size(ders(("a" * 50).toList, EVIL)) == 7
       
   220 */
       
   221 
       
   222 
       
   223 // Some testing data
       
   224 //===================
       
   225 /*
       
   226 
       
   227 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE))   // => ALTs(List(ONE, CHAR(a)))
       
   228 simp(((CHAR('a') | ZERO) ~ ONE) | (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO)))   // => CHAR(a)
       
   229 
       
   230 matcher(("a" ~ "b") ~ "c", "ab")   // => false
       
   231 matcher(("a" ~ "b") ~ "c", "abc")  // => true
       
   232 
   144 
   233 
   145 // the supposedly 'evil' regular expression (a*)* b
   234 // the supposedly 'evil' regular expression (a*)* b
   146 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   235 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   147 
   236 
   148 println(matcher(EVIL, "a" * 1000 ++ "b"))   // => true
   237 matcher(EVIL, "a" * 1000)          // => false
   149 println(matcher(EVIL, "a" * 1000))          // => false
   238 matcher(EVIL, "a" * 1000 ++ "b")   // => true
       
   239 
   150 
   240 
   151 // size without simplifications
   241 // size without simplifications
   152 println(size(der('a', der('a', EVIL))))             // => 36
   242 size(der('a', der('a', EVIL)))             // => 36
   153 println(size(der('a', der('a', der('a', EVIL)))))   // => 83
   243 size(der('a', der('a', der('a', EVIL))))   // => 83
   154 
   244 
   155 // size with simplification
   245 // size with simplification
   156 println(simp(der('a', der('a', EVIL))))          
   246 size(simp(der('a', der('a', EVIL))))           // => 7
   157 println(simp(der('a', der('a', der('a', EVIL)))))
   247 size(simp(der('a', der('a', der('a', EVIL))))) // => 7
   158 
       
   159 println(size(simp(der('a', der('a', EVIL)))))           // => 7
       
   160 println(size(simp(der('a', der('a', der('a', EVIL)))))) // => 7
       
   161 
   248 
   162 // Python needs around 30 seconds for matching 28 a's with EVIL. 
   249 // Python needs around 30 seconds for matching 28 a's with EVIL. 
   163 // Java 9 and later increase this to an "astonishing" 40000 a's in
   250 // Java 9 and later increase this to an "astonishing" 40000 a's in
   164 // around 30 seconds.
   251 // 30 seconds.
   165 //
   252 //
   166 // Lets see how long it takes to match strings with 
   253 // Lets see how long it really takes to match strings with 
   167 // 5 Million a's...it should be in the range of a 
   254 // 5 Million a's...it should be in the range of a few
   168 // few seconds.
   255 // of seconds.
   169 
   256 
   170 def time_needed[T](i: Int, code: => T) = {
   257 def time_needed[T](i: Int, code: => T) = {
   171   val start = System.nanoTime()
   258   val start = System.nanoTime()
   172   for (j <- 1 to i) code
   259   for (j <- 1 to i) code
   173   val end = System.nanoTime()
   260   val end = System.nanoTime()
   177 for (i <- 0 to 5000000 by 500000) {
   264 for (i <- 0 to 5000000 by 500000) {
   178   println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") 
   265   println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") 
   179 }
   266 }
   180 
   267 
   181 // another "power" test case 
   268 // another "power" test case 
   182 println(simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next()) == ONE)
   269 simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(50).next()) == ONE
   183 
   270 
   184 // the Iterator produces the rexp
   271 // the Iterator produces the rexp
   185 //
   272 //
   186 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   273 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   187 //
   274 //
   188 //    where SEQ is nested 100 times.
   275 //    where SEQ is nested 50 times.
   189 */ 
   276 
   190 
   277 */
   191 
   278 
   192 assert(simp(ZERO | ONE) == ONE)
   279 }
   193 assert(simp(STAR(ZERO | ONE)) == STAR(ZERO | ONE))
   280 
   194 assert(simp(ONE ~ (ONE ~ (ONE ~ CHAR('a')))) == CHAR('a'))
   281 
   195 assert(simp(((ONE ~ ONE) ~ ONE) ~ CHAR('a')) == CHAR('a'))
   282 
   196 assert(simp(((ONE | ONE) ~ ONE) ~ CHAR('a')) == CHAR('a'))
   283 
   197 assert(simp(ONE ~ (ONE ~ (ONE ~ ZERO))) == ZERO)
   284 
   198 assert(simp(ALT(ONE ~ (ONE ~ (ONE ~ ZERO)), CHAR('a'))) == CHAR('a'))
   285 
   199 assert(simp(CHAR('a') | CHAR('a')) == CHAR('a'))
   286 // This template code is subject to copyright 
   200 assert(simp(CHAR('a') ~ CHAR('a')) == CHAR('a') ~ CHAR('a'))
   287 // by King's College London, 2022. Do not 
   201 assert(simp(ONE | CHAR('a')) == (ONE | CHAR('a')))
   288 // make the template code public in any shape 
   202 assert(simp(ALT((CHAR('a') | ZERO) ~ ONE,
   289 // or form, and do not exchange it with other 
   203                   ((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))) == CHAR('a'))
   290 // students under any circumstance.
   204 assert(simp((ZERO | ((ZERO | ZERO) | (ZERO | ZERO))) ~ ((ONE | ZERO) | ONE ) ~ (CHAR('a'))) == ZERO)
       
   205 assert(simp(ALT(ONE | ONE, ONE | ONE)) == ONE)
       
   206 assert(simp(ALT(ZERO | CHAR('a'), CHAR('a') | ZERO)) == CHAR('a'))
       
   207 assert(simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)) == ALT(ONE, CHAR('a')))
       
   208 assert(simp(ALTs(Nil)) == ZERO)
       
   209 assert(simp(SEQs(List(CHAR('a')))) == CHAR('a'))
       
   210 
       
   211 
       
   212 }
       
   213