| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Mon, 20 Oct 2025 22:18:21 +0200 | |
| changeset 1013 | 1a23d87d1700 | 
| parent 974 | 0cb4bf2469d1 | 
| permissions | -rw-r--r-- | 
| 732 | 1 | // Parser Combinators: Simple Version | 
| 2 | //==================================== | |
| 742 | 3 | // | 
| 4 | // Call with | |
| 5 | // | |
| 6 | // amm comb1.sc | |
| 732 | 7 | |
| 960 | 8 | |
| 906 | 9 | // Note, in the lectures I did not show the type bound | 
| 961 | 10 | // I : IsSeq, which means that the input type 'I' needs | 
| 11 | // to be a sequence that can be tested to be empty. | |
| 12 | ||
| 13 | trait IsSeq[I] {
 | |
| 14 | extension (i: I) def isEmpty: Boolean | |
| 15 | } | |
| 732 | 16 | |
| 961 | 17 | given IsSeq[String] = _.isEmpty | 
| 18 | given [I]: IsSeq[Seq[I]] = _.isEmpty | |
| 19 | ||
| 941 | 20 | |
| 961 | 21 | // parser class | 
| 22 | //============== | |
| 23 | ||
| 24 | abstract class Parser[I : IsSeq, T]  {
 | |
| 960 | 25 | def parse(in: I): Set[(T, I)] | 
| 732 | 26 | |
| 27 | def parse_all(in: I) : Set[T] = | |
| 960 | 28 | for ((hd, tl) <- parse(in); | 
| 961 | 29 | if tl.isEmpty) yield hd | 
| 732 | 30 | } | 
| 31 | ||
| 32 | // parser combinators | |
| 961 | 33 | //==================== | 
| 941 | 34 | |
| 799 | 35 | // alternative parser | 
| 960 | 36 | class AltParser[I : IsSeq, T](p: => Parser[I, T], | 
| 941 | 37 |                               q: => Parser[I, T]) extends Parser[I, T] {
 | 
| 960 | 38 | def parse(in: I) = p.parse(in) ++ q.parse(in) | 
| 799 | 39 | } | 
| 40 | ||
| 732 | 41 | // sequence parser | 
| 960 | 42 | class SeqParser[I: IsSeq, T, S](p: => Parser[I, T], | 
| 941 | 43 |                                 q: => Parser[I, S]) extends Parser[I, (T, S)] {
 | 
| 960 | 44 | def parse(in: I) = | 
| 45 | for ((hd1, tl1) <- p.parse(in); | |
| 732 | 46 | (hd2, tl2) <- q.parse(tl1)) yield ((hd1, hd2), tl2) | 
| 47 | } | |
| 48 | ||
| 742 | 49 | // map parser | 
| 960 | 50 | class MapParser[I : IsSeq, T, S](p: => Parser[I, T], | 
| 954 | 51 |                                  f: T => S) extends Parser[I, S] {
 | 
| 732 | 52 | def parse(in: I) = for ((hd, tl) <- p.parse(in)) yield (f(hd), tl) | 
| 53 | } | |
| 54 | ||
| 742 | 55 | |
| 56 | ||
| 732 | 57 | // an example of an atomic parser for characters | 
| 58 | case class CharParser(c: Char) extends Parser[String, Char] {
 | |
| 960 | 59 | def parse(in: String) = | 
| 732 | 60 | if (in != "" && in.head == c) Set((c, in.tail)) else Set() | 
| 61 | } | |
| 62 | ||
| 972 | 63 | |
| 64 | ||
| 954 | 65 | val ap = CharParser('a')
 | 
| 66 | val bp = CharParser('b')
 | |
| 67 | ||
| 972 | 68 | print(ap.parse("aade"))
 | 
| 69 | ||
| 954 | 70 | val abp = SeqParser(ap, bp) | 
| 972 | 71 | print(abp.parse("abade"))
 | 
| 72 | ||
| 73 | val abp = AltParser(ap, bp) | |
| 74 | print(abp.parse("abc"))
 | |
| 75 | ||
| 954 | 76 | MapParser(abp, ab => s"$ab").parse("abc")
 | 
| 799 | 77 | |
| 732 | 78 | // an atomic parser for parsing strings according to a regex | 
| 79 | import scala.util.matching.Regex | |
| 80 | ||
| 81 | case class RegexParser(reg: Regex) extends Parser[String, String] {
 | |
| 82 |   def parse(in: String) = reg.findPrefixMatchOf(in) match {
 | |
| 83 | case None => Set() | |
| 960 | 84 | case Some(m) => Set((m.matched, m.after.toString)) | 
| 732 | 85 | } | 
| 86 | } | |
| 87 | ||
| 960 | 88 | // atomic parsers for numbers and "verbatim" strings | 
| 732 | 89 | val NumParser = RegexParser("[0-9]+".r)
 | 
| 972 | 90 | NumParser.parse("123abc345")
 | 
| 91 | ||
| 732 | 92 | def StrParser(s: String) = RegexParser(Regex.quote(s).r) | 
| 93 | ||
| 960 | 94 | NumParser.parse("123a123bc")
 | 
| 906 | 95 | StrParser("else").parse("elsethen")
 | 
| 742 | 96 | |
| 799 | 97 | |
| 732 | 98 | // NumParserInt transforms a "string integer" into a propper Int | 
| 99 | // (needs "new" because MapParser is not a case class) | |
| 100 | ||
| 954 | 101 | val NumParserInt = MapParser(NumParser, (s: String) => s.toInt) | 
| 897 | 102 | NumParserInt.parse("123abc")
 | 
| 732 | 103 | |
| 960 | 104 | // the following string interpolation allows us to write | 
| 105 | // StrParser(_some_string_) more conveniently as | |
| 732 | 106 | // | 
| 960 | 107 | // p"<_some_string_>" | 
| 732 | 108 | |
| 972 | 109 | |
| 110 | ||
| 960 | 111 | extension (sc: StringContext) | 
| 112 | def p(args: Any*) = StrParser(sc.s(args*)) | |
| 897 | 113 | |
| 114 | ||
| 960 | 115 | (p"else").parse("elsethen")
 | 
| 799 | 116 | |
| 732 | 117 | // more convenient syntax for parser combinators | 
| 941 | 118 | extension [I: IsSeq, T](p: Parser[I, T]) {
 | 
| 732 | 119 | def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q) | 
| 120 | def ~[S] (q : => Parser[I, S]) = new SeqParser[I, T, S](p, q) | |
| 919 | 121 | def map[S](f: => T => S) = new MapParser[I, T, S](p, f) | 
| 732 | 122 | } | 
| 123 | ||
| 960 | 124 | // simple example of transforming the | 
| 948 | 125 | // result into capital letters | 
| 897 | 126 | def toU(s: String) = s.map(_.toUpper) | 
| 127 | ||
| 960 | 128 | (p"else").map(toU(_)).parse("elseifthen")
 | 
| 897 | 129 | |
| 732 | 130 | // these implicits allow us to use an infix notation for | 
| 131 | // sequences and alternatives; we also can write the usual | |
| 132 | // map for a MapParser | |
| 133 | ||
| 134 | ||
| 135 | // with this NumParserInt can now be written more conveniently | |
| 136 | // as: | |
| 137 | ||
| 799 | 138 | val NumParserInt2 = NumParser.map(_.toInt) | 
| 732 | 139 | |
| 972 | 140 | |
| 732 | 141 | |
| 142 | // A parser for palindromes (just returns them as string) | |
| 948 | 143 | // since the parser is recursive it needs to be lazy | 
| 803 | 144 | lazy val Pal : Parser[String, String] = {
 | 
| 960 | 145 |    (p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => s"$x$y$z" } ||
 | 
| 146 |    (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => s"$x$y$z" } ||
 | |
| 896 | 147 | p"a" || p"b" || p"" | 
| 960 | 148 | } | 
| 732 | 149 | |
| 150 | // examples | |
| 972 | 151 | Pal.parse("abaaba")
 | 
| 954 | 152 | Pal.parse("abacaaba")
 | 
| 732 | 153 | |
| 154 | println("Palindrome: " + Pal.parse_all("abaaaba"))
 | |
| 155 | ||
| 960 | 156 | // A parser for wellnested parentheses | 
| 799 | 157 | // | 
| 158 | // P ::= ( P ) P | epsilon | |
| 159 | // | |
| 160 | //   (transforms '(' -> '{' , ')' -> '}' )
 | |
| 161 | lazy val P : Parser[String, String] = {
 | |
| 919 | 162 |   (p"(" ~ P ~ p")" ~ P).map{ case (((_, x), _), y) => "{" + x + "}" + y } ||
 | 
| 799 | 163 | p"" | 
| 960 | 164 | } | 
| 732 | 165 | |
| 166 | println(P.parse_all("(((()()))())"))
 | |
| 167 | println(P.parse_all("(((()()))()))"))
 | |
| 168 | println(P.parse_all(")("))
 | |
| 169 | println(P.parse_all("()"))
 | |
| 170 | ||
| 171 | // A parser for arithmetic expressions (Terms and Factors) | |
| 898 | 172 | |
| 972 | 173 | |
| 799 | 174 | lazy val E: Parser[String, Int] = {
 | 
| 732 | 175 |   (T ~ p"+" ~ E).map{ case ((x, _), z) => x + z } ||
 | 
| 799 | 176 |   (T ~ p"-" ~ E).map{ case ((x, _), z) => x - z } || T }
 | 
| 177 | lazy val T: Parser[String, Int] = {
 | |
| 178 |   (F ~ p"*" ~ T).map{ case ((x, _), z) => x * z } || F }
 | |
| 179 | lazy val F: Parser[String, Int] = {
 | |
| 180 |   (p"(" ~ E ~ p")").map{ case ((_, y), _) => y } || NumParserInt }
 | |
| 956 | 181 | |
| 974 | 182 | println(E.parse_all("2 * 2 * 2"))
 | 
| 732 | 183 | println(E.parse_all("1+3+4"))
 | 
| 184 | println(E.parse("1+3+4"))
 | |
| 185 | println(E.parse_all("4*2+3"))
 | |
| 186 | println(E.parse_all("4*(2+3)"))
 | |
| 954 | 187 | println(E.parse_all("(4)*(((2+3)))"))
 | 
| 732 | 188 | println(E.parse_all("4/2+3"))
 | 
| 189 | println(E.parse("1 + 2 * 3"))
 | |
| 190 | println(E.parse_all("(1+2)+3"))
 | |
| 799 | 191 | println(E.parse_all("1+2+3"))
 | 
| 732 | 192 | |
| 193 | ||
| 961 | 194 | // with parser combinators (and many other parsing algorithms) | 
| 972 | 195 | // no left-recursion is allowed, otherwise they will loop; | 
| 196 | // newer versions of Scala (3.5+) will actually give a warning | |
| 961 | 197 | // about this | 
| 732 | 198 | |
| 961 | 199 | /* | 
| 960 | 200 | lazy val EL: Parser[String, Int] = | 
| 201 |   ((EL ~ p"+" ~ EL).map{ case ((x, y), z) => x + z} ||
 | |
| 732 | 202 |    (EL ~ p"*" ~ EL).map{ case ((x, y), z) => x * z} ||
 | 
| 203 |    (p"(" ~ EL ~ p")").map{ case ((x, y), z) => y} ||
 | |
| 204 | NumParserInt) | |
| 961 | 205 | */ | 
| 732 | 206 | |
| 207 | // this will run forever: | |
| 208 | //println(EL.parse_all("1+2+3"))
 | |
| 209 | ||
| 210 | ||
| 211 | // non-ambiguous vs ambiguous grammars | |
| 212 | ||
| 213 | // ambiguous | |
| 214 | lazy val S : Parser[String, String] = | |
| 215 |   (p"1" ~ S ~ S).map{ case ((x, y), z) => x + y + z } || p""
 | |
| 216 | ||
| 850 | 217 | //println(time(S.parse("1" * 10)))
 | 
| 218 | //println(time(S.parse_all("1" * 10)))
 | |
| 732 | 219 | |
| 220 | // non-ambiguous | |
| 221 | lazy val U : Parser[String, String] = | |
| 222 |   (p"1" ~ U).map{ case (x, y) => x + y } || p""
 | |
| 223 | ||
| 850 | 224 | //println(time(U.parse("1" * 10)))
 | 
| 225 | //println(time(U.parse_all("1" * 10)))
 | |
| 732 | 226 | println(U.parse("1" * 25))
 | 
| 227 | ||
| 228 | U.parse("11")
 | |
| 229 | U.parse("11111")
 | |
| 230 | U.parse("11011")
 | |
| 231 | ||
| 232 | U.parse_all("1" * 100)
 | |
| 233 | U.parse_all("1" * 100 + "0")
 | |
| 234 | ||
| 235 | // you can see the difference in second example | |
| 236 | //S.parse_all("1" * 100)         // succeeds
 | |
| 237 | //S.parse_all("1" * 100 + "0")   // fails
 | |
| 238 | ||
| 239 | ||
| 240 | // A variant which counts how many 1s are parsed | |
| 241 | lazy val UCount : Parser[String, Int] = | |
| 799 | 242 |   (p"1" ~ UCount).map{ case (_, y) => y + 1 } || p"".map{ _ => 0 }
 | 
| 732 | 243 | |
| 244 | println(UCount.parse("11111"))
 | |
| 245 | println(UCount.parse_all("11111"))
 | |
| 246 | ||
| 247 | // Two single character parsers | |
| 248 | lazy val One : Parser[String, String] = p"a" | |
| 249 | lazy val Two : Parser[String, String] = p"b" | |
| 250 | ||
| 251 | One.parse("a")
 | |
| 252 | One.parse("aaa")
 | |
| 253 | ||
| 254 | // note how the pairs nest to the left with sequence parsers | |
| 255 | (One ~ One).parse("aaa")
 | |
| 256 | (One ~ One ~ One).parse("aaa")
 | |
| 257 | (One ~ One ~ One ~ One).parse("aaaa")
 | |
| 258 | ||
| 259 | (One || Two).parse("aaa")
 | |
| 260 | ||
| 261 | ||
| 262 | ||
| 960 | 263 | // a problem with the arithmetic expression parser: it | 
| 742 | 264 | // gets very slow with deeply nested parentheses | 
| 732 | 265 | |
| 919 | 266 | println("A runtime problem")
 | 
| 732 | 267 | println(E.parse("1"))
 | 
| 268 | println(E.parse("(1)"))
 | |
| 269 | println(E.parse("((1))"))
 | |
| 906 | 270 | println(E.parse("(((1)))"))
 | 
| 271 | println(E.parse("((((1))))"))
 | |
| 919 | 272 | println(E.parse("((((((1))))))"))
 | 
| 273 | println(E.parse("(((((((1)))))))"))
 | |
| 906 | 274 | //println(E.parse("((((((((1))))))))"))
 | 
| 828 | 275 | |
| 276 | ||
| 919 | 277 | // faster because of merge in the +/- case | 
| 906 | 278 | lazy val E2: Parser[String, Int] = {
 | 
| 919 | 279 |   (T2 ~ (p"+" || p"-") ~ E2).map[Int]{ case ((x, y), z) => if (y == "+") x + z else x - z} || T2 }
 | 
| 906 | 280 | lazy val T2: Parser[String, Int] = {
 | 
| 919 | 281 |   (F2 ~ p"*" ~ T2).map[Int]{ case ((x, _), z) => x * z } || F2 }
 | 
| 906 | 282 | lazy val F2: Parser[String, Int] = {
 | 
| 919 | 283 |   (p"(" ~ E2 ~ p")").map[Int]{ case ((_, y), _) => y } || NumParserInt }
 | 
| 828 | 284 | |
| 285 | ||
| 919 | 286 | println("mitigated by merging clauses")
 | 
| 906 | 287 | println(E2.parse("1"))
 | 
| 288 | println(E2.parse("(1)"))
 | |
| 289 | println(E2.parse("((1))"))
 | |
| 290 | println(E2.parse("(((1)))"))
 | |
| 291 | println(E2.parse("((((1))))"))
 | |
| 919 | 292 | println(E2.parse("((((((1))))))"))
 | 
| 293 | println(E2.parse("(((((((1)))))))"))
 | |
| 937 | 294 | println(E2.parse("((((((((1))))))))"))
 | 
| 959 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 295 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 296 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 297 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 298 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 299 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 300 | /* | 
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 301 | Try | 
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 302 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 303 | 6 / 2 * (2+1) | 
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 304 | |
| 
64ec1884d860
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
956diff
changeset | 305 | */ |