| 417 |      1 | // Main Part 3 about Regular Expression Matching
 | 
| 430 |      2 | //==============================================
 | 
| 153 |      3 | 
 | 
| 403 |      4 | object M3 {
 | 
| 249 |      5 | 
 | 
| 153 |      6 | abstract class Rexp
 | 
|  |      7 | case object ZERO extends Rexp
 | 
|  |      8 | case object ONE extends Rexp
 | 
|  |      9 | case class CHAR(c: Char) extends Rexp
 | 
| 430 |     10 | case class ALTs(rs: List[Rexp]) extends Rexp  // alternatives 
 | 
|  |     11 | case class SEQs(rs: List[Rexp]) extends Rexp  // sequences
 | 
|  |     12 | case class STAR(r: Rexp) extends Rexp         // star
 | 
| 153 |     13 | 
 | 
|  |     14 | 
 | 
| 430 |     15 | //the usual binary choice and binary sequence can be defined 
 | 
|  |     16 | //in terms of ALTs and SEQs
 | 
| 403 |     17 | def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2))
 | 
| 430 |     18 | def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2))
 | 
| 403 |     19 | 
 | 
| 460 |     20 | 
 | 
|  |     21 | // some convenience for typing regular expressions
 | 
| 229 |     22 | 
 | 
| 153 |     23 | def charlist2rexp(s: List[Char]): Rexp = s match {
 | 
|  |     24 |   case Nil => ONE
 | 
|  |     25 |   case c::Nil => CHAR(c)
 | 
|  |     26 |   case c::s => SEQ(CHAR(c), charlist2rexp(s))
 | 
|  |     27 | }
 | 
| 472 |     28 | 
 | 
|  |     29 | import scala.language.implicitConversions
 | 
| 153 |     30 | 
 | 
| 472 |     31 | given Conversion[String, Rexp] = (s => charlist2rexp(s.toList))
 | 
|  |     32 | 
 | 
|  |     33 | extension (r: Rexp) {
 | 
| 153 |     34 |   def | (s: Rexp) = ALT(r, s)
 | 
|  |     35 |   def % = STAR(r)
 | 
|  |     36 |   def ~ (s: Rexp) = SEQ(r, s)
 | 
|  |     37 | }
 | 
|  |     38 | 
 | 
| 472 |     39 | // some examples for the conversion and extension:
 | 
| 153 |     40 | 
 | 
| 460 |     41 | // val areg : Rexp = "a" | "b"
 | 
| 472 |     42 | //  => ALTs(List(CHAR('a'), CHAR('b')))
 | 
|  |     43 | //
 | 
| 460 |     44 | // val sreg : Rexp = "a" ~ "b"
 | 
| 472 |     45 | //  => SEQs(List(CHAR('a'), CHAR('b'))) 
 | 
|  |     46 | //
 | 
|  |     47 | // val star_reg : Rexp = ("a" ~ "b").%
 | 
|  |     48 | //  => STAR(SEQs(List(CHAR('a'), CHAR('b')))) 
 | 
| 460 |     49 | 
 | 
|  |     50 | // ADD YOUR CODE BELOW
 | 
|  |     51 | //======================
 | 
|  |     52 | 
 | 
| 472 |     53 | // (1) 
 | 
| 347 |     54 | def nullable (r: Rexp) : Boolean = r match {
 | 
| 472 |     55 |   case ZERO => false
 | 
|  |     56 |   case ONE => true
 | 
|  |     57 |   case CHAR(_) => false
 | 
|  |     58 |   case ALTs(rs) => rs.exists(nullable)
 | 
|  |     59 |   case SEQs(rs) => rs.forall(nullable)
 | 
|  |     60 |   case STAR(_) => true
 | 
| 153 |     61 | }
 | 
|  |     62 | 
 | 
| 430 |     63 | // (2) 
 | 
| 472 |     64 | def der(c: Char, r: Rexp) : Rexp = r match {
 | 
|  |     65 |   case ZERO => ZERO
 | 
|  |     66 |   case ONE => ZERO
 | 
|  |     67 |   case CHAR(d) => if (c == d) ONE else ZERO
 | 
|  |     68 |   case ALTs(rs) => ALTs(rs.map(der(c, _)))
 | 
|  |     69 |   case SEQs(Nil) => ZERO
 | 
|  |     70 |   case SEQs(r1::rs) => 
 | 
|  |     71 |     if (nullable(r1)) ALT(SEQs(der(c, r1)::rs), der(c, SEQs(rs)))
 | 
|  |     72 |     else SEQs(der(c, r1) :: rs)
 | 
|  |     73 |   case STAR(r1) => SEQ(der(c, r1), STAR(r1))
 | 
| 417 |     74 | }
 | 
|  |     75 | 
 | 
|  |     76 | 
 | 
| 430 |     77 | // (3) 
 | 
|  |     78 | def denest(rs: List[Rexp]) : List[Rexp] = rs match {
 | 
| 472 |     79 |   case Nil => Nil
 | 
|  |     80 |   case ZERO::tl => denest(tl)
 | 
|  |     81 |   case ALTs(rs1)::rs2 => rs1 ::: denest(rs2)  
 | 
|  |     82 |   case r::rs => r :: denest(rs) 
 | 
| 430 |     83 | }
 | 
| 417 |     84 | 
 | 
| 430 |     85 | // (4)
 | 
|  |     86 | def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match {
 | 
| 472 |     87 |   case Nil => acc
 | 
|  |     88 |   case ZERO::rs => ZERO::Nil
 | 
|  |     89 |   case ONE::rs => flts(rs, acc)
 | 
|  |     90 |   case SEQs(rs1)::rs => flts(rs, acc ::: rs1)
 | 
|  |     91 |   case r::rs => flts(rs, acc :+ r) 
 | 
| 153 |     92 | }
 | 
|  |     93 | 
 | 
| 430 |     94 | // (5)
 | 
|  |     95 | def ALTs_smart(rs: List[Rexp]) : Rexp = rs match {
 | 
| 472 |     96 |   case Nil => ZERO
 | 
|  |     97 |   case r::Nil => r  
 | 
|  |     98 |   case rs => ALTs(rs)
 | 
| 430 |     99 | }
 | 
| 403 |    100 | 
 | 
| 430 |    101 | def SEQs_smart(rs: List[Rexp]) : Rexp = rs match {
 | 
| 472 |    102 |   case Nil => ONE
 | 
|  |    103 |   case ZERO::Nil => ZERO
 | 
|  |    104 |   case r::Nil => r
 | 
|  |    105 |   case rs => SEQs(rs) 
 | 
| 460 |    106 | }
 | 
|  |    107 | 
 | 
| 472 |    108 | // (6) 
 | 
| 460 |    109 | 
 | 
|  |    110 | def simp(r: Rexp) : Rexp = r match {
 | 
| 472 |    111 |   case ALTs(rs) => 
 | 
|  |    112 |     ALTs_smart(denest(rs.map(simp)).distinct)
 | 
|  |    113 |   case SEQs(rs) => 
 | 
|  |    114 |     SEQs_smart(flts(rs.map(simp)))
 | 
|  |    115 |   case r => r
 | 
| 430 |    116 | }
 | 
| 421 |    117 | 
 | 
| 472 |    118 | //println("Simp tests")
 | 
|  |    119 | //println(simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)))
 | 
|  |    120 | //println(simp(((CHAR('a') | ZERO) ~ ONE) | 
 | 
|  |    121 | //              (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))))
 | 
| 403 |    122 | 
 | 
| 472 |    123 | // (7) 
 | 
|  |    124 | 
 | 
| 460 |    125 | def ders (s: List[Char], r: Rexp) : Rexp = s match {
 | 
| 472 |    126 |   case Nil => r
 | 
|  |    127 |   case c::s => ders(s, simp(der(c, r)))
 | 
| 153 |    128 | }
 | 
|  |    129 | 
 | 
| 472 |    130 | // main matcher function
 | 
|  |    131 | def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r))
 | 
| 460 |    132 | 
 | 
|  |    133 | // (8) 
 | 
| 472 |    134 | 
 | 
| 460 |    135 | def size(r: Rexp): Int = r match {
 | 
| 472 |    136 |   case ZERO => 1
 | 
|  |    137 |   case ONE => 1
 | 
|  |    138 |   case CHAR(_) => 1
 | 
|  |    139 |   case ALTs(rs) => 1 + rs.map(size).sum
 | 
|  |    140 |   case SEQs(rs) => 1 + rs.map(size).sum
 | 
|  |    141 |   case STAR(r1) => 1 + size(r1)
 | 
| 460 |    142 | }
 | 
|  |    143 | 
 | 
| 221 |    144 | 
 | 
| 430 |    145 | 
 | 
| 472 |    146 | // some testing data
 | 
| 460 |    147 | /*
 | 
| 472 |    148 | println(matcher(("a" ~ "b") ~ "c", "abc"))  // => true
 | 
|  |    149 | println(matcher(("a" ~ "b") ~ "c", "ab"))   // => false
 | 
| 229 |    150 | 
 | 
|  |    151 | // the supposedly 'evil' regular expression (a*)* b
 | 
| 430 |    152 | val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
 | 
| 229 |    153 | 
 | 
| 472 |    154 | println(matcher(EVIL, "a" * 1000 ++ "b"))   // => true
 | 
|  |    155 | println(matcher(EVIL, "a" * 1000))          // => false
 | 
| 153 |    156 | 
 | 
|  |    157 | // size without simplifications
 | 
| 472 |    158 | println(size(der('a', der('a', EVIL))))             // => 36
 | 
|  |    159 | println(size(der('a', der('a', der('a', EVIL)))))   // => 83
 | 
| 153 |    160 | 
 | 
|  |    161 | // size with simplification
 | 
| 472 |    162 | println(simp(der('a', der('a', EVIL))))          
 | 
|  |    163 | println(simp(der('a', der('a', der('a', EVIL)))))
 | 
|  |    164 | 
 | 
|  |    165 | println(size(simp(der('a', der('a', EVIL)))))           // => 7
 | 
|  |    166 | println(size(simp(der('a', der('a', der('a', EVIL)))))) // => 7
 | 
| 228 |    167 | 
 | 
| 229 |    168 | // Python needs around 30 seconds for matching 28 a's with EVIL. 
 | 
| 221 |    169 | // Java 9 and later increase this to an "astonishing" 40000 a's in
 | 
| 472 |    170 | // around 30 seconds.
 | 
| 153 |    171 | //
 | 
| 472 |    172 | // Lets see how long it takes to match strings with 
 | 
|  |    173 | // 5 Million a's...it should be in the range of a 
 | 
|  |    174 | // few seconds.
 | 
| 153 |    175 | 
 | 
| 421 |    176 | def time_needed[T](i: Int, code: => T) = {
 | 
|  |    177 |   val start = System.nanoTime()
 | 
|  |    178 |   for (j <- 1 to i) code
 | 
|  |    179 |   val end = System.nanoTime()
 | 
|  |    180 |   "%.5f".format((end - start)/(i * 1.0e9))
 | 
|  |    181 | }
 | 
| 153 |    182 | 
 | 
| 430 |    183 | for (i <- 0 to 5000000 by 500000) {
 | 
|  |    184 |   println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") 
 | 
|  |    185 | }
 | 
| 221 |    186 | 
 | 
| 229 |    187 | // another "power" test case 
 | 
| 472 |    188 | println(simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next()) == ONE)
 | 
| 221 |    189 | 
 | 
|  |    190 | // the Iterator produces the rexp
 | 
|  |    191 | //
 | 
|  |    192 | //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
 | 
|  |    193 | //
 | 
| 472 |    194 | //    where SEQ is nested 100 times.
 | 
|  |    195 | */ 
 | 
| 228 |    196 | 
 | 
| 452 |    197 | 
 | 
| 300 |    198 | }
 | 
| 430 |    199 | 
 | 
| 460 |    200 | 
 | 
|  |    201 | 
 | 
|  |    202 | 
 | 
|  |    203 | 
 | 
|  |    204 | 
 | 
|  |    205 | // This template code is subject to copyright 
 | 
|  |    206 | // by King's College London, 2022. Do not 
 | 
|  |    207 | // make the template code public in any shape 
 | 
|  |    208 | // or form, and do not exchange it with other 
 | 
|  |    209 | // students under any circumstance.
 |