testing4/re.scala
changeset 229 5549016ab10f
parent 228 33c2655be47d
child 236 e461b5325b5e
equal deleted inserted replaced
228:33c2655be47d 229:5549016ab10f
     1 // Part 1 about Regular Expression Matching
     1 // Part 1 about Regular Expression Matching
     2 //==========================================
     2 //==========================================
       
     3 
       
     4 //object CW9a {
     3 
     5 
     4 // Regular Expressions
     6 // Regular Expressions
     5 abstract class Rexp
     7 abstract class Rexp
     6 case object ZERO extends Rexp
     8 case object ZERO extends Rexp
     7 case object ONE extends Rexp
     9 case object ONE extends Rexp
     8 case class CHAR(c: Char) extends Rexp
    10 case class CHAR(c: Char) extends Rexp
     9 case class ALT(r1: Rexp, r2: Rexp) extends Rexp   // alternative
    11 case class ALT(r1: Rexp, r2: Rexp) extends Rexp 
    10 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp   // sequence
    12 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp 
    11 case class STAR(r: Rexp) extends Rexp             // star
    13 case class STAR(r: Rexp) extends Rexp 
    12 
    14 
       
    15 // some convenience for typing in regular expressions
    13 
    16 
    14 // some convenience for typing regular expressions
    17 import scala.language.implicitConversions    
       
    18 import scala.language.reflectiveCalls 
    15 
    19 
    16 import scala.language.implicitConversions
       
    17 import scala.language.reflectiveCalls
       
    18 
    20 
    19 def charlist2rexp(s: List[Char]): Rexp = s match {
    21 def charlist2rexp(s: List[Char]): Rexp = s match {
    20   case Nil => ONE
    22   case Nil => ONE
    21   case c::Nil => CHAR(c)
    23   case c::Nil => CHAR(c)
    22   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    24   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    36   def ~ (r: Rexp) = SEQ(s, r)
    38   def ~ (r: Rexp) = SEQ(s, r)
    37   def ~ (r: String) = SEQ(s, r)
    39   def ~ (r: String) = SEQ(s, r)
    38 }
    40 }
    39 
    41 
    40 // (1) Complete the function nullable according to
    42 // (1) Complete the function nullable according to
    41 // the definition given in the coursework; this
    43 // the definition given in the coursework; this 
    42 // function checks whether a regular expression
    44 // function checks whether a regular expression
    43 // can match the empty string and Returns a boolean
    45 // can match the empty string and Returns a boolean
    44 // accordingly.
    46 // accordingly.
    45 
    47 
    46 def nullable (r: Rexp) : Boolean = r match{
    48 def nullable (r: Rexp) : Boolean = r match {
    47   case ZERO => false
    49   case ZERO => false
    48   case ONE => true
    50   case ONE => true
    49   case CHAR(_) => false
    51   case CHAR(_) => false
    50   case ALT(a,b)=>nullable(a)||nullable(b)
    52   case ALT(r1, r2) => nullable(r1) || nullable(r2)
    51   case SEQ(a,b) => nullable(a) && nullable(b)
    53   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
    52   case STAR(_) => true
    54   case STAR(_) => true
    53 }
    55 }
    54 
    56 
    55 
       
    56 /*val rex = "1~0.%|11"
       
    57 
       
    58 assert(der('1',rex) == SEQ(ONE,SEQ(CHAR(~),SEQ(CHAR(0),SEQ(CHAR(.),SEQ(CHAR(%),SEQ(CHAR(|),SEQ(CHAR(1),CHAR(1)))))))))
       
    59 
       
    60 assert(der('1',der('1',rex)) ==
       
    61         ALT(SEQ(ZERO,SEQ(CHAR(~),SEQ(CHAR(0),SEQ(CHAR(.),SEQ(CHAR(%),SEQ(CHAR(|),
       
    62         SEQ(CHAR(1),CHAR(1)))))))),SEQ(ZERO,SEQ(CHAR(0),SEQ(CHAR(.),SEQ(CHAR(%),
       
    63         SEQ(CHAR(|),SEQ(CHAR(1),CHAR(1))))))))*/
       
    64 
       
    65 // (2) Complete the function der according to
    57 // (2) Complete the function der according to
    66 // the definition given in the coursework; this
    58 // the definition given in the coursework; this
    67 // function calculates the derivative of a
    59 // function calculates the derivative of a 
    68 // regular expression w.r.t. a character.
    60 // regular expression w.r.t. a character.
    69 
    61 
    70 def der (c: Char, r: Rexp) : Rexp = r match{
    62 def der (c: Char, r: Rexp) : Rexp = r match {
    71   case ZERO => ZERO
    63   case ZERO => ZERO
    72   case ONE => ZERO
    64   case ONE => ZERO
    73   case CHAR(d) => if (c==d) ONE else ZERO
    65   case CHAR(d) => if (c == d) ONE else ZERO
    74   case ALT(a,b) => der(c,a)|der(c,b)
    66   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    75   case SEQ(a,b) => if(nullable(a)) {(der(c,a)~b)|der(c,b)}
    67   case SEQ(r1, r2) => 
    76                    else der(c,a)~b
    68     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    77   case STAR(a) => der(c,a)~STAR(a)
    69     else SEQ(der(c, r1), r2)
       
    70   case STAR(r1) => SEQ(der(c, r1), STAR(r1))
    78 }
    71 }
    79 
    72 
    80 println(der('a', ZERO | ONE))// == (ZERO | ZERO)
       
    81 println(der('a', (CHAR('a') | ONE) ~ CHAR('a')))// ==ALT((ONE | ZERO) ~ CHAR('a'), ONE)
       
    82 println(der('a', STAR(CHAR('a'))))// == (ONE ~ STAR(CHAR('a')))
       
    83 println(der('b', STAR(CHAR('a'))))// == (ZERO ~ STAR(CHAR('a'))))
       
    84 
       
    85 
       
    86 //ALT(SEQ(ZERO,ZERO),ZERO)
       
    87 //ALT(ALT(ZERO,ZERO),ALT(ZERO,ZERO))
       
    88 
       
    89 // * == |
       
    90 // + == ~
       
    91 // (3) Complete the simp function according to
    73 // (3) Complete the simp function according to
    92 // the specification given in the coursework; this
    74 // the specification given in the coursework; this
    93 // function simplifies a regular expression from
    75 // function simplifies a regular expression from
    94 // the inside out, like you would simplify arithmetic
    76 // the inside out, like you would simplify arithmetic 
    95 // expressions; however it does not simplify inside
    77 // expressions; however it does not simplify inside 
    96 // STAR-regular expressions.
    78 // STAR-regular expressions.
    97 
    79 
    98 /*
    80 def simp(r: Rexp) : Rexp = r match {
    99 def simp(r: Rexp) : Rexp = r match{
    81   case ALT(r1, r2) => (simp(r1), simp(r2)) match {
   100   case SEQ(ZERO,_) => ZERO
    82     case (ZERO, r2s) => r2s
   101   case SEQ(_,ZERO) => ZERO
    83     case (r1s, ZERO) => r1s
   102   case SEQ(ONE,a) => simp(a)
    84     case (r1s, r2s) => if (r1s == r2s) r1s else ALT (r1s, r2s)
   103   case SEQ(a,ONE) => simp(a)
    85   }
   104   case ALT(ZERO,a) => simp(a)
    86   case SEQ(r1, r2) =>  (simp(r1), simp(r2)) match {
   105   case ALT(a,ZERO) => simp(a)
    87     case (ZERO, _) => ZERO
   106   case ALT(a,b) => if(a == b) simp(a) else r
    88     case (_, ZERO) => ZERO
   107   case _ => r
    89     case (ONE, r2s) => r2s
   108 }*/
    90     case (r1s, ONE) => r1s
   109 
    91     case (r1s, r2s) => SEQ(r1s, r2s)
   110 def simp(r: Rexp) : Rexp = r match{
    92   }
   111   case SEQ(a,b) =>{ val sa = simp(a)
    93   case r => r
   112                     val sb = simp(b)
       
   113                     if(sa == ZERO || sb == ZERO) ZERO
       
   114                     else if(sa == ONE) sb
       
   115                     else if(sb == ONE) sa
       
   116                     else SEQ(sa,sb)
       
   117                     }
       
   118   case ALT(a,b) =>{ val sa = simp(a)
       
   119                     val sb = simp(b)
       
   120                     if(sa == ONE || sb == ONE) ONE
       
   121                     else if(sa == ZERO) sb
       
   122                     else if(sb == ZERO) sa
       
   123                     else if(sa == sb) sa
       
   124                     else ALT(sa,sb)
       
   125                     }
       
   126   //case STAR(STAR(a)) => simp(STAR(a))
       
   127   //case STAR(a) => STAR(simp(a))
       
   128   case _ => r
       
   129   /*
       
   130   case SEQ(ZERO,_) => ZERO
       
   131   case SEQ(_,ZERO) => ZERO
       
   132   case SEQ(ONE,a) => simp(a)
       
   133   case SEQ(a,ONE) => simp(a)
       
   134   case SEQ(a,b) => SEQ(simp(a),simp(b))
       
   135   //case ALT(ZERO,a) => simp(a)
       
   136   case ALT(a,ZERO) => simp(a)
       
   137   case ALT(ONE,_) => ONE
       
   138   case ALT(_,ONE) => ONE
       
   139   case ALT(a,b) => {val sa = simp(a)
       
   140                     if(sa == simp(b)) sa else r
       
   141                     }
       
   142   case STAR(STAR(a)) => simp(STAR(a))
       
   143   case STAR(a) => STAR(simp(a))
       
   144   case _ => r*/
       
   145 }
    94 }
   146 
    95 
   147 
    96 
   148 /*val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
    97 // (4) Complete the two functions below; the first 
   149 println("TEST: " + simp(der('a', der('a', EVIL))))
       
   150 println(simp(ONE))
       
   151 val r1 = ALT(ZERO,ONE)
       
   152 val r2 = SEQ(ONE,ZERO)
       
   153 val r3 = SEQ(r1,SEQ(r2,r1))
       
   154 println("R1 = " + simp(r1))
       
   155 println(simp(r2))
       
   156 println(simp(r3))
       
   157 */
       
   158 
       
   159 // (4) Complete the two functions below; the first
       
   160 // calculates the derivative w.r.t. a string; the second
    98 // calculates the derivative w.r.t. a string; the second
   161 // is the regular expression matcher taking a regular
    99 // is the regular expression matcher taking a regular
   162 // expression and a string and checks whether the
   100 // expression and a string and checks whether the
   163 // string matches the regular expression
   101 // string matches the regular expression.
   164 
   102 
   165 def ders (s: List[Char], r: Rexp ="") : Rexp = s match{
   103 def ders (s: List[Char], r: Rexp) : Rexp = s match {
   166   case Nil => r
   104   case Nil => r
   167   case a::z => ders(z,simp(der(a,r)))
   105   case c::s => ders(s, simp(der(c, r)))
   168 }
   106 }
   169 
   107 
   170 def matcher(r: Rexp, s: String): Boolean = {
   108 // main matcher function
   171   val derivatives = simp(ders(s.toList,r))
   109 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r))
   172   nullable(derivatives)
       
   173 }
       
   174 
   110 
   175 // (5) Complete the size function for regular
   111 // (5) Complete the size function for regular
   176 // expressions according to the specification
   112 // expressions according to the specification 
   177 // given in the coursework.
   113 // given in the coursework.
   178 
   114 
   179 def size(r: Rexp): Int = r match{
   115 
       
   116 def size(r: Rexp): Int = r match {
   180   case ZERO => 1
   117   case ZERO => 1
   181   case ONE => 1
   118   case ONE => 1
   182   case CHAR(_) => 1
   119   case CHAR(_) => 1
   183   case SEQ(a,b) => 1 + size(a) + size(b)
   120   case ALT(r1, r2) => 1 + size(r1) + size (r2)
   184   case ALT(a,b) => 1 + size(a) + size(b)
   121   case SEQ(r1, r2) => 1 + size(r1) + size (r2)
   185   case STAR(a) => 1 + size(a)
   122   case STAR(r1) => 1 + size(r1)
   186 }
   123 }
   187 
   124 
   188 println(der('a', ZERO | ONE))// == (ZERO | ZERO)
   125 
   189 println(der('a', (CHAR('a') | ONE) ~ CHAR('a')))// ==ALT((ONE | ZERO) ~ CHAR('a'), ONE)
   126 
   190 println(der('a', STAR(CHAR('a'))))// == (ONE ~ STAR(CHAR('a')))
       
   191 println(der('b', STAR(CHAR('a'))))// == (ZERO ~ STAR(CHAR('a'))))
       
   192 // some testing data
   127 // some testing data
   193 /*
   128 /*
   194 
   129 matcher(("a" ~ "b") ~ "c", "abc")  // => true
   195 assert(matcher(("a" ~ "b") ~ "c", "abc") == true)  // => true
   130 matcher(("a" ~ "b") ~ "c", "ab")   // => false
   196 assert(matcher(("a" ~ "b") ~ "c", "ab") == false)   // => false
       
   197 
       
   198 
   131 
   199 // the supposedly 'evil' regular expression (a*)* b
   132 // the supposedly 'evil' regular expression (a*)* b
   200 //val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   133 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
   201 
   134 
   202 
   135 matcher(EVIL, "a" * 1000 ++ "b")   // => true
   203 assert(matcher(EVIL, "a" * 1000 ++ "b") == true) // => true
   136 matcher(EVIL, "a" * 1000)          // => false
   204 assert(matcher(EVIL, "a" * 1000) == false)          // => false
       
   205 
   137 
   206 // size without simplifications
   138 // size without simplifications
   207 assert("28 " + size(der('a', der('a', EVIL)))             ==28)// => 28
   139 size(der('a', der('a', EVIL)))             // => 28
   208 assert("58 " + size(der('a', der('a', der('a', EVIL))))   ==58)// => 58
   140 size(der('a', der('a', der('a', EVIL))))   // => 58
   209 
   141 
   210 // size with simplification
   142 // size with simplification
   211 assert("8 " + size(simp(der('a', der('a', EVIL))))           ==8)// => 8
   143 size(simp(der('a', der('a', EVIL))))           // => 8
   212 assert("8 " + size(simp(der('a', der('a', der('a', EVIL))))) ==8) // => 8
   144 size(simp(der('a', der('a', der('a', EVIL))))) // => 8
   213 
   145 
   214 */
   146 // Python needs around 30 seconds for matching 28 a's with EVIL. 
   215 
       
   216 
       
   217 /*
       
   218 // Python needs around 30 seconds for matching 28 a's with EVIL.
       
   219 // Java 9 and later increase this to an "astonishing" 40000 a's in
   147 // Java 9 and later increase this to an "astonishing" 40000 a's in
   220 // 30 seconds.
   148 // around 30 seconds.
   221 //
   149 //
   222 // Lets see how long it really takes to match strings with
   150 // Lets see how long it takes to match strings with 
   223 // 5 Million a's...it should be in the range of a couple
   151 // 5 Million a's...it should be in the range of a 
   224 // of seconds.
   152 // couple of seconds.
   225 
   153 
   226 def time_needed[T](i: Int, code: => T) = {
   154 def time_needed[T](i: Int, code: => T) = {
   227   val start = System.nanoTime()
   155   val start = System.nanoTime()
   228   for (j <- 1 to i) code
   156   for (j <- 1 to i) code
   229   val end = System.nanoTime()
   157   val end = System.nanoTime()
   232 
   160 
   233 for (i <- 0 to 5000000 by 500000) {
   161 for (i <- 0 to 5000000 by 500000) {
   234   println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL, "a" * i))))
   162   println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL, "a" * i))))
   235 }
   163 }
   236 
   164 
   237 // another "power" test case
   165 // another "power" test case 
   238 simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(50).next) == ONE
   166 simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next) == ONE
   239 
   167 
   240 // the Iterator produces the rexp
   168 // the Iterator produces the rexp
   241 //
   169 //
   242 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   170 //      SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)
   243 //
   171 //
   244 //    where SEQ is nested 50 times.
   172 //    where SEQ is nested 100 times.
       
   173  
       
   174 */
   245 
   175 
   246 */
   176 //}