| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Mon, 21 Jul 2025 16:38:07 +0100 | |
| changeset 491 | 2a30c7dfe3ed | 
| parent 474 | 8a61bcd51ec3 | 
| permissions | -rw-r--r-- | 
| 396 | 1 | // Main Part 3 about Regular Expression Matching | 
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 2 | //============================================== | 
| 288 | 3 | |
| 396 | 4 | object M3 {
 | 
| 219 | 5 | |
| 6 | abstract class Rexp | |
| 7 | case object ZERO extends Rexp | |
| 8 | case object ONE extends Rexp | |
| 9 | case class CHAR(c: Char) extends Rexp | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 10 | case class ALTs(rs: List[Rexp]) extends Rexp // alternatives | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 11 | case class SEQs(rs: List[Rexp]) extends Rexp // sequences | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 12 | case class STAR(r: Rexp) extends Rexp // star | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 13 | |
| 219 | 14 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 15 | //the usual binary choice and binary sequence can be defined | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 16 | //in terms of ALTs and SEQs | 
| 396 | 17 | def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2)) | 
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 18 | def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2)) | 
| 396 | 19 | |
| 421 | 20 | // some convenience for typing regular expressions | 
| 219 | 21 | |
| 22 | def charlist2rexp(s: List[Char]): Rexp = s match {
 | |
| 23 | case Nil => ONE | |
| 24 | case c::Nil => CHAR(c) | |
| 25 | case c::s => SEQ(CHAR(c), charlist2rexp(s)) | |
| 26 | } | |
| 474 | 27 | |
| 28 | import scala.language.implicitConversions | |
| 219 | 29 | |
| 474 | 30 | given Conversion[String, Rexp] = (s => charlist2rexp(s.toList)) | 
| 31 | ||
| 32 | extension (r: Rexp) {
 | |
| 219 | 33 | def | (s: Rexp) = ALT(r, s) | 
| 34 | def % = STAR(r) | |
| 35 | def ~ (s: Rexp) = SEQ(r, s) | |
| 36 | } | |
| 37 | ||
| 474 | 38 | // some examples for the conversion and extension: | 
| 219 | 39 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 40 | // val areg : Rexp = "a" | "b" | 
| 474 | 41 | //  => ALTs(List(CHAR('a'), CHAR('b')))
 | 
| 42 | // | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 43 | // val sreg : Rexp = "a" ~ "b" | 
| 474 | 44 | //  => SEQs(List(CHAR('a'), CHAR('b'))) 
 | 
| 45 | // | |
| 46 | // val star_reg : Rexp = ("a" ~ "b").%
 | |
| 47 | //  => STAR(SEQs(List(CHAR('a'), CHAR('b')))) 
 | |
| 219 | 48 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 49 | // ADD YOUR CODE BELOW | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 50 | //====================== | 
| 396 | 51 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 52 | // (1) | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 53 | def nullable (r: Rexp) : Boolean = ??? | 
| 396 | 54 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 55 | // (2) | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 56 | def der (c: Char, r: Rexp) : Rexp = ??? | 
| 396 | 57 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 58 | // (3) | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 59 | def denest(rs: List[Rexp]) : List[Rexp] = ??? | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 60 | |
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 61 | // (4) | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 62 | def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = ??? | 
| 219 | 63 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 64 | // (5) | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 65 | def ALTs_smart(rs: List[Rexp]) : Rexp = ??? | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 66 | def SEQs_smart(rs: List[Rexp]) : Rexp = ??? | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 67 | |
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 68 | // (6) | 
| 347 | 69 | def simp(r: Rexp) : Rexp = ??? | 
| 219 | 70 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 71 | // (7) | 
| 347 | 72 | def ders (s: List[Char], r: Rexp) : Rexp = ??? | 
| 73 | def matcher(r: Rexp, s: String): Boolean = ??? | |
| 219 | 74 | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 75 | // (8) | 
| 347 | 76 | def size(r: Rexp): Int = ??? | 
| 219 | 77 | |
| 78 | ||
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 79 | // Some testing data | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 80 | //=================== | 
| 219 | 81 | /* | 
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 82 | |
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 83 | simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE))   // => ALTs(List(ONE, CHAR(a)))
 | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 84 | simp(((CHAR('a') | ZERO) ~ ONE) | 
 | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 85 |      (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO)))   // => CHAR(a)
 | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 86 | |
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 87 | matcher(("a" ~ "b") ~ "c", "ab")   // => false
 | 
| 219 | 88 | matcher(("a" ~ "b") ~ "c", "abc")  // => true
 | 
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 89 | |
| 219 | 90 | |
| 91 | // the supposedly 'evil' regular expression (a*)* b | |
| 92 | val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
 | |
| 93 | ||
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 94 | matcher(EVIL, "a" * 1000) // => false | 
| 219 | 95 | matcher(EVIL, "a" * 1000 ++ "b") // => true | 
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 96 | |
| 219 | 97 | |
| 98 | // size without simplifications | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 99 | size(der('a', der('a', EVIL)))             // => 36
 | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 100 | size(der('a', der('a', der('a', EVIL))))   // => 83
 | 
| 219 | 101 | |
| 102 | // size with simplification | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 103 | size(simp(der('a', der('a', EVIL))))           // => 7
 | 
| 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 104 | size(simp(der('a', der('a', der('a', EVIL))))) // => 7
 | 
| 219 | 105 | |
| 220 | 106 | // Python needs around 30 seconds for matching 28 a's with EVIL. | 
| 107 | // Java 9 and later increase this to an "astonishing" 40000 a's in | |
| 108 | // 30 seconds. | |
| 219 | 109 | // | 
| 110 | // Lets see how long it really takes to match strings with | |
| 425 
6e990ae2c6a3
updated solutions and templates
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
421diff
changeset | 111 | // 5 Million a's...it should be in the range of a few | 
| 220 | 112 | // of seconds. | 
| 219 | 113 | |
| 114 | def time_needed[T](i: Int, code: => T) = {
 | |
| 115 | val start = System.nanoTime() | |
| 116 | for (j <- 1 to i) code | |
| 117 | val end = System.nanoTime() | |
| 400 | 118 | "%.5f".format((end - start)/(i * 1.0e9)) | 
| 219 | 119 | } | 
| 120 | ||
| 121 | for (i <- 0 to 5000000 by 500000) {
 | |
| 400 | 122 |   println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") 
 | 
| 219 | 123 | } | 
| 124 | ||
| 220 | 125 | // another "power" test case | 
| 400 | 126 | simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(50).next()) == ONE | 
| 220 | 127 | |
| 128 | // the Iterator produces the rexp | |
| 129 | // | |
| 130 | // SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE) | |
| 131 | // | |
| 132 | // where SEQ is nested 50 times. | |
| 133 | ||
| 219 | 134 | */ | 
| 135 | ||
| 288 | 136 | } |