testing4/re.scala
changeset 228 33c2655be47d
parent 221 9e7897f25e13
child 229 5549016ab10f
equal deleted inserted replaced
227:b5f3e814a710 228:33c2655be47d
     1 // Part 1 about Regular Expression Matching
     1 // Part 1 about Regular Expression Matching
     2 //==========================================
     2 //==========================================
     3 
       
     4 //object CW9a {
       
     5 
     3 
     6 // Regular Expressions
     4 // Regular Expressions
     7 abstract class Rexp
     5 abstract class Rexp
     8 case object ZERO extends Rexp
     6 case object ZERO extends Rexp
     9 case object ONE extends Rexp
     7 case object ONE extends Rexp
    10 case class CHAR(c: Char) extends Rexp
     8 case class CHAR(c: Char) extends Rexp
    11 case class ALT(r1: Rexp, r2: Rexp) extends Rexp 
     9 case class ALT(r1: Rexp, r2: Rexp) extends Rexp   // alternative
    12 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp 
    10 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp   // sequence
    13 case class STAR(r: Rexp) extends Rexp 
    11 case class STAR(r: Rexp) extends Rexp             // star
    14 
    12 
    15 // some convenience for typing in regular expressions
    13 
    16 
    14 // some convenience for typing regular expressions
    17 import scala.language.implicitConversions    
    15 
    18 import scala.language.reflectiveCalls 
    16 import scala.language.implicitConversions
    19 
    17 import scala.language.reflectiveCalls
    20 
    18 
    21 def charlist2rexp(s: List[Char]): Rexp = s match {
    19 def charlist2rexp(s: List[Char]): Rexp = s match {
    22   case Nil => ONE
    20   case Nil => ONE
    23   case c::Nil => CHAR(c)
    21   case c::Nil => CHAR(c)
    24   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    22   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    38   def ~ (r: Rexp) = SEQ(s, r)
    36   def ~ (r: Rexp) = SEQ(s, r)
    39   def ~ (r: String) = SEQ(s, r)
    37   def ~ (r: String) = SEQ(s, r)
    40 }
    38 }
    41 
    39 
    42 // (1) Complete the function nullable according to
    40 // (1) Complete the function nullable according to
    43 // the definition given in the coursework; this 
    41 // the definition given in the coursework; this
    44 // function checks whether a regular expression
    42 // function checks whether a regular expression
    45 // can match the empty string and Returns a boolean
    43 // can match the empty string and Returns a boolean
    46 // accordingly.
    44 // accordingly.
    47 
    45 
    48 def nullable (r: Rexp) : Boolean = r match {
    46 def nullable (r: Rexp) : Boolean = r match{
    49   case ZERO => false
    47   case ZERO => false
    50   case ONE => true
    48   case ONE => true
    51   case CHAR(_) => false
    49   case CHAR(_) => false
    52   case ALT(r1, r2) => nullable(r1) || nullable(r2)
    50   case ALT(a,b)=>nullable(a)||nullable(b)
    53   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
    51   case SEQ(a,b) => nullable(a) && nullable(b)
    54   case STAR(_) => true
    52   case STAR(_) => true
    55 }
    53 }
       
    54 
       
    55 
       
    56 /*val rex = "1~0.%|11"
       
    57 
       
    58 assert(der('1',rex) == SEQ(ONE,SEQ(CHAR(~),SEQ(CHAR(0),SEQ(CHAR(.),SEQ(CHAR(%),SEQ(CHAR(|),SEQ(CHAR(1),CHAR(1)))))))))
       
    59 
       
    60 assert(der('1',der('1',rex)) ==
       
    61         ALT(SEQ(ZERO,SEQ(CHAR(~),SEQ(CHAR(0),SEQ(CHAR(.),SEQ(CHAR(%),SEQ(CHAR(|),
       
    62         SEQ(CHAR(1),CHAR(1)))))))),SEQ(ZERO,SEQ(CHAR(0),SEQ(CHAR(.),SEQ(CHAR(%),
       
    63         SEQ(CHAR(|),SEQ(CHAR(1),CHAR(1))))))))*/
    56 
    64 
    57 // (2) Complete the function der according to
    65 // (2) Complete the function der according to
    58 // the definition given in the coursework; this
    66 // the definition given in the coursework; this
    59 // function calculates the derivative of a 
    67 // function calculates the derivative of a
    60 // regular expression w.r.t. a character.
    68 // regular expression w.r.t. a character.
    61 
    69 
    62 def der (c: Char, r: Rexp) : Rexp = r match {
    70 def der (c: Char, r: Rexp) : Rexp = r match{
    63   case ZERO => ZERO
    71   case ZERO => ZERO
    64   case ONE => ZERO
    72   case ONE => ZERO
    65   case CHAR(d) => if (c == d) ONE else ZERO
    73   case CHAR(d) => if (c==d) ONE else ZERO
    66   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    74   case ALT(a,b) => der(c,a)|der(c,b)
    67   case SEQ(r1, r2) => 
    75   case SEQ(a,b) => if(nullable(a)) {(der(c,a)~b)|der(c,b)}
    68     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    76                    else der(c,a)~b
    69     else SEQ(der(c, r1), r2)
    77   case STAR(a) => der(c,a)~STAR(a)
    70   case STAR(r1) => SEQ(der(c, r1), STAR(r1))
    78 }
    71 }
    79 
    72 
    80 println(der('a', ZERO | ONE))// == (ZERO | ZERO)
       
    81 println(der('a', (CHAR('a') | ONE) ~ CHAR('a')))// ==ALT((ONE | ZERO) ~ CHAR('a'), ONE)
       
    82 println(der('a', STAR(CHAR('a'))))// == (ONE ~ STAR(CHAR('a')))
       
    83 println(der('b', STAR(CHAR('a'))))// == (ZERO ~ STAR(CHAR('a'))))
       
    84 
       
    85 
       
    86 //ALT(SEQ(ZERO,ZERO),ZERO)
       
    87 //ALT(ALT(ZERO,ZERO),ALT(ZERO,ZERO))
       
    88 
       
    89 // * == |
       
    90 // + == ~
    73 // (3) Complete the simp function according to
    91 // (3) Complete the simp function according to
    74 // the specification given in the coursework; this
    92 // the specification given in the coursework; this
    75 // function simplifies a regular expression from
    93 // function simplifies a regular expression from
    76 // the inside out, like you would simplify arithmetic 
    94 // the inside out, like you would simplify arithmetic
    77 // expressions; however it does not simplify inside 
    95 // expressions; however it does not simplify inside
    78 // STAR-regular expressions.
    96 // STAR-regular expressions.
    79 
    97 
    80 def simp(r: Rexp) : Rexp = r match {
    98 /*
    81   case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    99 def simp(r: Rexp) : Rexp = r match{
    82     case (ZERO, r2s) => r2s
   100   case SEQ(ZERO,_) => ZERO
    83     case (r1s, ZERO) => r1s
   101   case SEQ(_,ZERO) => ZERO
    84     case (r1s, r2s) => if (r1s == r2s) r1s else ALT (r1s, r2s)
   102   case SEQ(ONE,a) => simp(a)
    85   }
   103   case SEQ(a,ONE) => simp(a)
    86   case SEQ(r1, r2) =>  (simp(r1), simp(r2)) match {
   104   case ALT(ZERO,a) => simp(a)
    87     case (ZERO, _) => ZERO
   105   case ALT(a,ZERO) => simp(a)
    88     case (_, ZERO) => ZERO
   106   case ALT(a,b) => if(a == b) simp(a) else r
    89     case (ONE, r2s) => r2s
   107   case _ => r
    90     case (r1s, ONE) => r1s
   108 }*/
    91     case (r1s, r2s) => SEQ(r1s, r2s)
   109 
    92   }
   110 def simp(r: Rexp) : Rexp = r match{
    93   case r => r
   111   case SEQ(a,b) =>{ val sa = simp(a)
    94 }
   112                     val sb = simp(b)
    95 
   113                     if(sa == ZERO || sb == ZERO) ZERO
    96 
   114                     else if(sa == ONE) sb
    97 // (4) Complete the two functions below; the first 
   115                     else if(sb == ONE) sa
       
   116                     else SEQ(sa,sb)
       
   117                     }
       
   118   case ALT(a,b) =>{ val sa = simp(a)
       
   119                     val sb = simp(b)
       
   120                     if(sa == ONE || sb == ONE) ONE
       
   121                     else if(sa == ZERO) sb
       
   122                     else if(sb == ZERO) sa
       
   123                     else if(sa == sb) sa
       
   124                     else ALT(sa,sb)
       
   125                     }
       
   126   //case STAR(STAR(a)) => simp(STAR(a))
       
   127   //case STAR(a) => STAR(simp(a))
       
   128   case _ => r
       
   129   /*
       
   130   case SEQ(ZERO,_) => ZERO
       
   131   case SEQ(_,ZERO) => ZERO
       
   132   case SEQ(ONE,a) => simp(a)
       
   133   case SEQ(a,ONE) => simp(a)
       
   134   case SEQ(a,b) => SEQ(simp(a),simp(b))
       
   135   //case ALT(ZERO,a) => simp(a)
       
   136   case ALT(a,ZERO) => simp(a)
       
   137   case ALT(ONE,_) => ONE
       
   138   case ALT(_,ONE) => ONE
       
   139   case ALT(a,b) => {val sa = simp(a)
       
   140                     if(sa == simp(b)) sa else r
       
   141                     }
       
   142   case STAR(STAR(a)) => simp(STAR(a))
       
   143   case STAR(a) => STAR(simp(a))
       
   144   case _ => r*/
       
   145 }
       
   146 
       
   147 
       
   148 /*val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
       
   149 println("TEST: " + simp(der('a', der('a', EVIL))))
       
   150 println(simp(ONE))
       
   151 val r1 = ALT(ZERO,ONE)
       
   152 val r2 = SEQ(ONE,ZERO)
       
   153 val r3 = SEQ(r1,SEQ(r2,r1))
       
   154 println("R1 = " + simp(r1))
       
   155 println(simp(r2))
       
   156 println(simp(r3))
       
   157 */
       
   158 
       
   159 // (4) Complete the two functions below; the first
    98 // calculates the derivative w.r.t. a string; the second
   160 // calculates the derivative w.r.t. a string; the second
    99 // is the regular expression matcher taking a regular
   161 // is the regular expression matcher taking a regular
   100 // expression and a string and checks whether the
   162 // expression and a string and checks whether the
   101 // string matches the regular expression.
   163 // string matches the regular expression
   102 
   164 
   103 def ders (s: List[Char], r: Rexp) : Rexp = s match {
   165 def ders (s: List[Char], r: Rexp ="") : Rexp = s match{
   104   case Nil => r
   166   case Nil => r
   105   case c::s => ders(s, simp(der(c, r)))
   167   case a::z => ders(z,simp(der(a,r)))
   106 }
   168 }
   107 
   169 
   108 // main matcher function
   170 def matcher(r: Rexp, s: String): Boolean = {
   109 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r))
   171   val derivatives = simp(ders(s.toList,r))
       
   172   nullable(derivatives)
       
   173 }
   110 
   174 
   111 // (5) Complete the size function for regular
   175 // (5) Complete the size function for regular
   112 // expressions according to the specification 
   176 // expressions according to the specification
   113 // given in the coursework.
   177 // given in the coursework.
   114 
   178 
   115 
   179 def size(r: Rexp): Int = r match{
   116 def size(r: Rexp): Int = r match {
       
   117   case ZERO => 1
   180   case ZERO => 1
   118   case ONE => 1
   181   case ONE => 1
   119   case CHAR(_) => 1
   182   case CHAR(_) => 1
   120   case ALT(r1, r2) => 1 + size(r1) + size (r2)
   183   case SEQ(a,b) => 1 + size(a) + size(b)
   121   case SEQ(r1, r2) => 1 + size(r1) + size (r2)
   184   case ALT(a,b) => 1 + size(a) + size(b)
   122   case STAR(r1) => 1 + size(r1)
   185   case STAR(a) => 1 + size(a)
   123 }
   186 }
   124 
   187 
   125 
   188 println(der('a', ZERO | ONE))// == (ZERO | ZERO)
   126 
   189 println(der('a', (CHAR('a') | ONE) ~ CHAR('a')))// ==ALT((ONE | ZERO) ~ CHAR('a'), ONE)
       
   190 println(der('a', STAR(CHAR('a'))))// == (ONE ~ STAR(CHAR('a')))
       
   191 println(der('b', STAR(CHAR('a'))))// == (ZERO ~ STAR(CHAR('a'))))
   127 // some testing data
   192 // some testing data
   128 /*
   193 /*
   129 matcher(("a" ~ "b") ~ "c", "abc")  // => true
   194 
   130 matcher(("a" ~ "b") ~ "c", "ab")   // => false
   195 assert(matcher(("a" ~ "b") ~ "c", "abc") == true)  // => true
       
   196 assert(matcher(("a" ~ "b") ~ "c", "ab") == false)   // => false
       
   197 
   131 
   198 
   132 // the supposedly 'evil' regular expression (a*)* b
   199 // the supposedly 'evil' regular expression (a*)* b
   133 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   200 //val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   134 
   201 
   135 matcher(EVIL, "a" * 1000 ++ "b")   // => true
   202 
   136 matcher(EVIL, "a" * 1000)          // => false
   203 assert(matcher(EVIL, "a" * 1000 ++ "b") == true) // => true
       
   204 assert(matcher(EVIL, "a" * 1000) == false)          // => false
   137 
   205 
   138 // size without simplifications
   206 // size without simplifications
   139 size(der('a', der('a', EVIL)))             // => 28
   207 assert("28 " + size(der('a', der('a', EVIL)))             ==28)// => 28
   140 size(der('a', der('a', der('a', EVIL))))   // => 58
   208 assert("58 " + size(der('a', der('a', der('a', EVIL))))   ==58)// => 58
   141 
   209 
   142 // size with simplification
   210 // size with simplification
   143 size(simp(der('a', der('a', EVIL))))           // => 8
   211 assert("8 " + size(simp(der('a', der('a', EVIL))))           ==8)// => 8
   144 size(simp(der('a', der('a', der('a', EVIL))))) // => 8
   212 assert("8 " + size(simp(der('a', der('a', der('a', EVIL))))) ==8) // => 8
   145 
   213 
   146 // Python needs around 30 seconds for matching 28 a's with EVIL. 
   214 */
       
   215 
       
   216 
       
   217 /*
       
   218 // Python needs around 30 seconds for matching 28 a's with EVIL.
   147 // Java 9 and later increase this to an "astonishing" 40000 a's in
   219 // Java 9 and later increase this to an "astonishing" 40000 a's in
   148 // around 30 seconds.
   220 // 30 seconds.
   149 //
   221 //
   150 // Lets see how long it takes to match strings with 
   222 // Lets see how long it really takes to match strings with
   151 // 5 Million a's...it should be in the range of a 
   223 // 5 Million a's...it should be in the range of a couple
   152 // couple of seconds.
   224 // of seconds.
   153 
   225 
   154 def time_needed[T](i: Int, code: => T) = {
   226 def time_needed[T](i: Int, code: => T) = {
   155   val start = System.nanoTime()
   227   val start = System.nanoTime()
   156   for (j <- 1 to i) code
   228   for (j <- 1 to i) code
   157   val end = System.nanoTime()
   229   val end = System.nanoTime()
   160 
   232 
   161 for (i <- 0 to 5000000 by 500000) {
   233 for (i <- 0 to 5000000 by 500000) {
   162   println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL, "a" * i))))
   234   println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL, "a" * i))))
   163 }
   235 }
   164 
   236 
   165 // another "power" test case 
   237 // another "power" test case
   166 simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next) == ONE
   238 simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(50).next) == ONE
   167 
   239 
   168 // the Iterator produces the rexp
   240 // the Iterator produces the rexp
   169 //
   241 //
   170 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   242 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   171 //
   243 //
   172 //    where SEQ is nested 100 times.
   244 //    where SEQ is nested 50 times.
   173  
   245 
   174 */
   246 */
   175 
       
   176 //}