main_testing3/re.scala
changeset 475 59e005dcf163
parent 463 0315d9983cd0
equal deleted inserted replaced
474:b528d1d3d3c3 475:59e005dcf163
    17 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2))
    17 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2))
    18 def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2))
    18 def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2))
    19 
    19 
    20 
    20 
    21 // some convenience for typing regular expressions
    21 // some convenience for typing regular expressions
    22 import scala.language.implicitConversions    
       
    23 import scala.language.reflectiveCalls 
       
    24 
    22 
    25 def charlist2rexp(s: List[Char]): Rexp = s match {
    23 def charlist2rexp(s: List[Char]): Rexp = s match {
    26   case Nil => ONE
    24   case Nil => ONE
    27   case c::Nil => CHAR(c)
    25   case c::Nil => CHAR(c)
    28   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    26   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    29 }
    27 }
    30 implicit def string2rexp(s: String): Rexp = charlist2rexp(s.toList)
    28 
    31 
    29 import scala.language.implicitConversions
    32 implicit def RexpOps (r: Rexp) = new {
    30 
       
    31 given Conversion[String, Rexp] = (s => charlist2rexp(s.toList))
       
    32 
       
    33 extension (r: Rexp) {
    33   def | (s: Rexp) = ALT(r, s)
    34   def | (s: Rexp) = ALT(r, s)
    34   def % = STAR(r)
    35   def % = STAR(r)
    35   def ~ (s: Rexp) = SEQ(r, s)
    36   def ~ (s: Rexp) = SEQ(r, s)
    36 }
    37 }
    37 
    38 
    38 implicit def stringOps (s: String) = new {
    39 // some examples for the conversion and extension:
    39   def | (r: Rexp) = ALT(s, r)
    40 
    40   def | (r: String) = ALT(s, r)
       
    41   def % = STAR(s)
       
    42   def ~ (r: Rexp) = SEQ(s, r)
       
    43   def ~ (r: String) = SEQ(s, r)
       
    44 }
       
    45 
       
    46 // examples for the implicits:
       
    47 // ALT(CHAR('a'), CHAR('b'))
       
    48 // val areg : Rexp = "a" | "b"
    41 // val areg : Rexp = "a" | "b"
    49 
    42 //  => ALTs(List(CHAR('a'), CHAR('b')))
    50 // SEQ(CHAR('a'), CHAR('b')) 
    43 //
    51 // val sreg : Rexp = "a" ~ "b"
    44 // val sreg : Rexp = "a" ~ "b"
    52 
    45 //  => SEQs(List(CHAR('a'), CHAR('b'))) 
       
    46 //
       
    47 // val star_reg : Rexp = ("a" ~ "b").%
       
    48 //  => STAR(SEQs(List(CHAR('a'), CHAR('b')))) 
    53 
    49 
    54 // ADD YOUR CODE BELOW
    50 // ADD YOUR CODE BELOW
    55 //======================
    51 //======================
    56 
    52 
    57 // (1)
    53 // (1) 
    58 def nullable (r: Rexp) : Boolean = r match {
    54 def nullable (r: Rexp) : Boolean = r match {
    59   case ZERO     => false
    55   case ZERO => false
    60   case ONE      => true
    56   case ONE => true
    61   case CHAR(_)  => false
    57   case CHAR(_) => false
    62   case ALTs(rs) => (for(reg <- rs) yield nullable(reg)).exists(_ == true)
    58   case ALTs(rs) => rs.exists(nullable)
    63   case SEQs(rs) => (for(reg <- rs) yield nullable(reg)).forall(_ == true)
    59   case SEQs(rs) => rs.forall(nullable)
    64   case STAR(_)  => true
    60   case STAR(_) => true
    65 }
    61 }
    66 
       
    67 /*
       
    68 nullable(ZERO) == false
       
    69 nullable(ONE) == true
       
    70 nullable(CHAR('a')) == false
       
    71 nullable(ZERO | ONE) == true
       
    72 nullable(ZERO | CHAR('a')) == false
       
    73 nullable(ONE ~ ONE) == true
       
    74 nullable(ONE ~ CHAR('a')) == false
       
    75 nullable(STAR(ZERO)) == true
       
    76 nullable(ALTs(List(ONE, CHAR('a'), ZERO))) == true
       
    77 nullable(SEQs(List(ONE, ALTs(List(ONE, CHAR('a'), ZERO)), STAR(ZERO)))) == true
       
    78 */
       
    79 
    62 
    80 // (2) 
    63 // (2) 
    81 def der (c: Char, r: Rexp) : Rexp = r match {
    64 def der(c: Char, r: Rexp) : Rexp = r match {
    82   case ZERO           => ZERO
    65   case ZERO => ZERO
    83   case ONE            => ZERO
    66   case ONE => ZERO
    84   case CHAR(d)        => if(c == d) ONE else ZERO
    67   case CHAR(d) => if (c == d) ONE else ZERO
    85   case ALTs(rs)       => ALTs(for(reg <- rs) yield der(c, reg))
    68   case ALTs(rs) => ALTs(rs.map(der(c, _)))
    86   case SEQs(Nil)      => ZERO
    69   case SEQs(Nil) => ZERO
    87   case SEQs(r :: rs)  => if(nullable(r)) ALT(SEQs(der(c, r) :: rs), der(c, SEQs(rs))) else SEQs(der(c, r) :: rs)
    70   case SEQs(r1::rs) => 
    88   case STAR(r)        => SEQ(der(c,r), STAR(r))
    71     if (nullable(r1)) ALT(SEQs(der(c, r1)::rs), der(c, SEQs(rs)))
    89 }
    72     else SEQs(der(c, r1) :: rs)
    90 
    73   case STAR(r1) => SEQ(der(c, r1), STAR(r1))
    91 /*
    74 }
    92 der('a', ZERO | ONE) == (ZERO | ZERO)
    75 
    93 der('a', (CHAR('a') | ONE) ~ CHAR('a')) == ALT((ONE | ZERO) ~ CHAR('a'), SEQs(List(ONE)))
       
    94 der('a', (CHAR('a') | CHAR('a')) ~ CHAR('a')) == (ONE | ONE) ~ CHAR('a')
       
    95 der('a', STAR(CHAR('a'))) == (ONE ~ STAR(CHAR('a')))
       
    96 der('b', STAR(CHAR('a'))) == (ZERO ~ STAR(CHAR('a')))
       
    97 */
       
    98 
    76 
    99 // (3) 
    77 // (3) 
   100 def denest(rs: List[Rexp]) : List[Rexp] = rs match {
    78 def denest(rs: List[Rexp]) : List[Rexp] = rs match {
   101   case Nil                => Nil
    79   case Nil => Nil
   102   case ZERO :: rest       => denest(rest)
    80   case ZERO::tl => denest(tl)
   103   case ALTs(rgs) :: rest  => rgs ::: denest(rest)
    81   case ALTs(rs1)::rs2 => rs1 ::: denest(rs2)  
   104   case r :: rest          => r :: denest(rest)
    82   case r::rs => r :: denest(rs) 
   105 }
    83 }
   106 
       
   107 /*
       
   108 denest(List(ONE, ZERO, ALTs(List(ONE, CHAR('a'))))) == List(ONE, ONE, CHAR('a'))
       
   109 denest(List(ONE ~ ONE, ZERO, ZERO | ONE)) == List(ONE ~ ONE, ZERO, ONE)
       
   110 */
       
   111 
    84 
   112 // (4)
    85 // (4)
   113 def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match {
    86 def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match {
   114   case Nil                => acc
    87   case Nil => acc
   115   case ZERO :: rest       => List(ZERO)
    88   case ZERO::rs => ZERO::Nil
   116   case ONE :: rest        => flts(rest, acc)
    89   case ONE::rs => flts(rs, acc)
   117   case SEQs(rgs) :: rest  => flts(rest, acc ::: rgs)
    90   case SEQs(rs1)::rs => flts(rs, acc ::: rs1)
   118   case r :: rest          => flts(rest, acc ::: List(r)) 
    91   case r::rs => flts(rs, acc :+ r) 
   119 }
    92 }
   120 
       
   121 /*
       
   122 flts(List(CHAR('a'), ZERO, ONE), Nil) == List(ZERO)
       
   123 flts(List(CHAR('a'), ONE, ONE, CHAR('b')), Nil) == List(CHAR('a'), CHAR('b'))
       
   124 flts(List(ONE ~ CHAR('a'), CHAR('b') ~ ONE), Nil) == List(ONE, CHAR('a'), CHAR('b'), ONE)
       
   125 */
       
   126 
    93 
   127 // (5)
    94 // (5)
   128 def ALTs_smart(rs: List[Rexp]) : Rexp = rs match {
    95 def ALTs_smart(rs: List[Rexp]) : Rexp = rs match {
   129   case Nil      => ZERO
    96   case Nil => ZERO
   130   case List(r)  => r
    97   case r::Nil => r  
   131   case _        => ALTs(rs)
    98   case rs => ALTs(rs)
   132 }
    99 }
   133 
   100 
   134 def SEQs_smart(rs: List[Rexp]) : Rexp = rs match {
   101 def SEQs_smart(rs: List[Rexp]) : Rexp = rs match {
   135   case Nil      => ONE
   102   case Nil => ONE
   136   case List(r)  => r
   103   case ZERO::Nil => ZERO
   137   case _        => SEQs(rs)
   104   case r::Nil => r
   138 }
   105   case rs => SEQs(rs) 
   139 
   106 }
       
   107 
       
   108 // (6) 
       
   109 
       
   110 def simp(r: Rexp) : Rexp = r match {
       
   111   case ALTs(rs) => 
       
   112     ALTs_smart(denest(rs.map(simp)).distinct)
       
   113   case SEQs(rs) => 
       
   114     SEQs_smart(flts(rs.map(simp)))
       
   115   case r => r
       
   116 }
       
   117 
       
   118 //println("Simp tests")
       
   119 //println(simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)))
       
   120 //println(simp(((CHAR('a') | ZERO) ~ ONE) | 
       
   121 //              (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))))
       
   122 
       
   123 // (7) 
       
   124 
       
   125 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
   126   case Nil => r
       
   127   case c::s => ders(s, simp(der(c, r)))
       
   128 }
       
   129 
       
   130 // main matcher function
       
   131 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r))
       
   132 
       
   133 // (8) 
       
   134 
       
   135 def size(r: Rexp): Int = r match {
       
   136   case ZERO => 1
       
   137   case ONE => 1
       
   138   case CHAR(_) => 1
       
   139   case ALTs(rs) => 1 + rs.map(size).sum
       
   140   case SEQs(rs) => 1 + rs.map(size).sum
       
   141   case STAR(r1) => 1 + size(r1)
       
   142 }
       
   143 
       
   144 
       
   145 
       
   146 // some testing data
   140 /*
   147 /*
   141 SEQs_smart(Nil) == ONE
   148 println(matcher(("a" ~ "b") ~ "c", "abc"))  // => true
   142 SEQs_smart(List(ZERO)) == ZERO
   149 println(matcher(("a" ~ "b") ~ "c", "ab"))   // => false
   143 SEQs_smart(List(CHAR('a'))) == CHAR('a')
       
   144 SEQs_smart(List(ONE ~ ONE)) == ONE ~ ONE
       
   145 SEQs_smart(List(ONE, ONE)) == SEQs(List(ONE, ONE))
       
   146 ALTs_smart(Nil) == ZERO
       
   147 ALTs_smart(List(ONE ~ ONE)) == ONE ~ ONE
       
   148 ALTs_smart(List(ZERO, ZERO)) == ALTs(List(ZERO, ZERO))
       
   149 */
       
   150 
       
   151 // (6)
       
   152 def simp(r: Rexp) : Rexp = r match {
       
   153   case ALTs(rs) => ALTs_smart(denest(for(reg <- rs) yield simp(reg)).distinct)
       
   154   case SEQs(rs) => SEQs_smart(flts(for(reg <- rs) yield simp(reg)))
       
   155   case _        => r
       
   156 }
       
   157 
       
   158 /*
       
   159 simp(ZERO | ONE) == ONE
       
   160 simp(STAR(ZERO | ONE)) == STAR(ZERO | ONE)
       
   161 simp(ONE ~ (ONE ~ (ONE ~ CHAR('a')))) == CHAR('a')
       
   162 simp(((ONE ~ ONE) ~ ONE) ~ CHAR('a')) == CHAR('a')
       
   163 simp(((ONE | ONE) ~ ONE) ~ CHAR('a')) == CHAR('a')
       
   164 simp(ONE ~ (ONE ~ (ONE ~ ZERO))) == ZERO
       
   165 simp(ALT(ONE ~ (ONE ~ (ONE ~ ZERO)), CHAR('a'))) == CHAR('a')
       
   166 simp(CHAR('a') | CHAR('a')) == CHAR('a')
       
   167 simp(CHAR('a') ~ CHAR('a')) == CHAR('a') ~ CHAR('a')
       
   168 simp(ONE | CHAR('a')) == (ONE | CHAR('a'))
       
   169 simp(ALT((CHAR('a') | ZERO) ~ ONE,((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))) == CHAR('a')
       
   170 simp((ZERO | ((ZERO | ZERO) | (ZERO | ZERO))) ~ ((ONE | ZERO) | ONE ) ~ (CHAR('a'))) == ZERO
       
   171 simp(ALT(ONE | ONE, ONE | ONE)) == ONE
       
   172 simp(ALT(ZERO | CHAR('a'), CHAR('a') | ZERO)) == CHAR('a')
       
   173 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)) == ALT(ONE, CHAR('a'))
       
   174 simp(ALTs(Nil)) == ZERO
       
   175 simp(SEQs(List(CHAR('a')))) == CHAR('a')
       
   176 */
       
   177 
       
   178 // (7)
       
   179 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
   180   case Nil      => r
       
   181   case c :: cs  => ders(cs, simp(der(c, r)))
       
   182 }
       
   183 def matcher(r: Rexp, s: String): Boolean = {
       
   184   val derivatives = ders(s.toList, r)
       
   185   nullable(derivatives)
       
   186 }
       
   187 
       
   188 /*
       
   189 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
       
   190 ders("aaaaa".toList, EVIL) == SEQs(List(STAR(CHAR('a')), STAR(STAR(CHAR('a'))), CHAR('b')))
       
   191 ders(List('b'), EVIL) == ONE
       
   192 ders("bb".toList, EVIL) == ZERO
       
   193 matcher(EVIL, "a" * 5 ++ "b") == true
       
   194 matcher(EVIL, "b") == true
       
   195 matcher(EVIL, "bb") == false
       
   196 matcher("abc", "abc") == true
       
   197 matcher(("ab" | "a") ~ (ONE | "bc"), "abc") == true
       
   198 matcher(ONE, "") == true
       
   199 matcher(ZERO, "") == false
       
   200 matcher(ONE | CHAR('a'), "") == true
       
   201 matcher(ONE | CHAR('a'), "a") == true
       
   202 */
       
   203 
       
   204 // (8) 
       
   205 def size(r: Rexp): Int = r match {
       
   206   case ZERO     => 1
       
   207   case ONE      => 1
       
   208   case CHAR(_)  => 1
       
   209   case ALTs(rs) => 1 + (for(reg <- rs) yield size(reg)).sum
       
   210   case SEQs(rs) => 1 + (for(reg <- rs) yield size(reg)).sum
       
   211   case STAR(r)  => 1 + size(r)
       
   212 }
       
   213 
       
   214 /*
       
   215 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
       
   216 size(der('a', der('a', EVIL))) == 36
       
   217 size(der('a', der('a', der('a', EVIL)))) == 83
       
   218 size(ders("aaaaaa".toList, EVIL)) == 7
       
   219 size(ders(("a" * 50).toList, EVIL)) == 7
       
   220 */
       
   221 
       
   222 
       
   223 // Some testing data
       
   224 //===================
       
   225 /*
       
   226 
       
   227 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE))   // => ALTs(List(ONE, CHAR(a)))
       
   228 simp(((CHAR('a') | ZERO) ~ ONE) | (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO)))   // => CHAR(a)
       
   229 
       
   230 matcher(("a" ~ "b") ~ "c", "ab")   // => false
       
   231 matcher(("a" ~ "b") ~ "c", "abc")  // => true
       
   232 
       
   233 
   150 
   234 // the supposedly 'evil' regular expression (a*)* b
   151 // the supposedly 'evil' regular expression (a*)* b
   235 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   152 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   236 
   153 
   237 matcher(EVIL, "a" * 1000)          // => false
   154 println(matcher(EVIL, "a" * 1000 ++ "b"))   // => true
   238 matcher(EVIL, "a" * 1000 ++ "b")   // => true
   155 println(matcher(EVIL, "a" * 1000))          // => false
   239 
       
   240 
   156 
   241 // size without simplifications
   157 // size without simplifications
   242 size(der('a', der('a', EVIL)))             // => 36
   158 println(size(der('a', der('a', EVIL))))             // => 36
   243 size(der('a', der('a', der('a', EVIL))))   // => 83
   159 println(size(der('a', der('a', der('a', EVIL)))))   // => 83
   244 
   160 
   245 // size with simplification
   161 // size with simplification
   246 size(simp(der('a', der('a', EVIL))))           // => 7
   162 println(simp(der('a', der('a', EVIL))))          
   247 size(simp(der('a', der('a', der('a', EVIL))))) // => 7
   163 println(simp(der('a', der('a', der('a', EVIL)))))
       
   164 
       
   165 println(size(simp(der('a', der('a', EVIL)))))           // => 7
       
   166 println(size(simp(der('a', der('a', der('a', EVIL)))))) // => 7
   248 
   167 
   249 // Python needs around 30 seconds for matching 28 a's with EVIL. 
   168 // Python needs around 30 seconds for matching 28 a's with EVIL. 
   250 // Java 9 and later increase this to an "astonishing" 40000 a's in
   169 // Java 9 and later increase this to an "astonishing" 40000 a's in
   251 // 30 seconds.
   170 // around 30 seconds.
   252 //
   171 //
   253 // Lets see how long it really takes to match strings with 
   172 // Lets see how long it takes to match strings with 
   254 // 5 Million a's...it should be in the range of a few
   173 // 5 Million a's...it should be in the range of a 
   255 // of seconds.
   174 // few seconds.
   256 
   175 
   257 def time_needed[T](i: Int, code: => T) = {
   176 def time_needed[T](i: Int, code: => T) = {
   258   val start = System.nanoTime()
   177   val start = System.nanoTime()
   259   for (j <- 1 to i) code
   178   for (j <- 1 to i) code
   260   val end = System.nanoTime()
   179   val end = System.nanoTime()
   264 for (i <- 0 to 5000000 by 500000) {
   183 for (i <- 0 to 5000000 by 500000) {
   265   println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") 
   184   println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") 
   266 }
   185 }
   267 
   186 
   268 // another "power" test case 
   187 // another "power" test case 
   269 simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(50).next()) == ONE
   188 println(simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next()) == ONE)
   270 
   189 
   271 // the Iterator produces the rexp
   190 // the Iterator produces the rexp
   272 //
   191 //
   273 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   192 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   274 //
   193 //
   275 //    where SEQ is nested 50 times.
   194 //    where SEQ is nested 100 times.
   276 
   195 */ 
   277 */
   196 
   278 
   197 
   279 }
   198 }
   280 
   199 
   281 
   200 
   282 
   201