| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Thu, 19 Sep 2024 15:47:33 +0100 | |
| changeset 960 | 791f4d9f53e1 | 
| parent 959 | 787ef75ec006 | 
| child 971 | b7d97a2a083b | 
| permissions | -rw-r--r-- | 
| 732 | 1 | // Parser Combinators: Simple Version | 
| 2 | //==================================== | |
| 742 | 3 | // | 
| 4 | // Call with | |
| 5 | // | |
| 6 | // amm comb1.sc | |
| 732 | 7 | |
| 959 | 8 | |
| 906 | 9 | // Note, in the lectures I did not show the type bound | 
| 960 | 10 | // I : IsSeq, which means that the input type 'I' needs | 
| 11 | // to be a sequence that can be tested to be empty. | |
| 12 | ||
| 13 | trait IsSeq[I] {
 | |
| 14 | extension (i: I) def isEmpty: Boolean | |
| 15 | } | |
| 732 | 16 | |
| 960 | 17 | given IsSeq[String] = _.isEmpty | 
| 18 | given [I]: IsSeq[Seq[I]] = _.isEmpty | |
| 19 | ||
| 940 | 20 | |
| 960 | 21 | // parser class | 
| 22 | //============== | |
| 23 | ||
| 24 | abstract class Parser[I : IsSeq, T]  {
 | |
| 959 | 25 | def parse(in: I): Set[(T, I)] | 
| 732 | 26 | |
| 27 | def parse_all(in: I) : Set[T] = | |
| 959 | 28 | for ((hd, tl) <- parse(in); | 
| 960 | 29 | if tl.isEmpty) yield hd | 
| 732 | 30 | } | 
| 31 | ||
| 32 | // parser combinators | |
| 960 | 33 | //==================== | 
| 940 | 34 | |
| 799 | 35 | // alternative parser | 
| 959 | 36 | class AltParser[I : IsSeq, T](p: => Parser[I, T], | 
| 940 | 37 |                               q: => Parser[I, T]) extends Parser[I, T] {
 | 
| 959 | 38 | def parse(in: I) = p.parse(in) ++ q.parse(in) | 
| 799 | 39 | } | 
| 40 | ||
| 732 | 41 | // sequence parser | 
| 959 | 42 | class SeqParser[I: IsSeq, T, S](p: => Parser[I, T], | 
| 940 | 43 |                                 q: => Parser[I, S]) extends Parser[I, (T, S)] {
 | 
| 959 | 44 | def parse(in: I) = | 
| 45 | for ((hd1, tl1) <- p.parse(in); | |
| 732 | 46 | (hd2, tl2) <- q.parse(tl1)) yield ((hd1, hd2), tl2) | 
| 47 | } | |
| 48 | ||
| 742 | 49 | // map parser | 
| 959 | 50 | class MapParser[I : IsSeq, T, S](p: => Parser[I, T], | 
| 953 | 51 |                                  f: T => S) extends Parser[I, S] {
 | 
| 732 | 52 | def parse(in: I) = for ((hd, tl) <- p.parse(in)) yield (f(hd), tl) | 
| 53 | } | |
| 54 | ||
| 742 | 55 | |
| 56 | ||
| 732 | 57 | // an example of an atomic parser for characters | 
| 58 | case class CharParser(c: Char) extends Parser[String, Char] {
 | |
| 959 | 59 | def parse(in: String) = | 
| 732 | 60 | if (in != "" && in.head == c) Set((c, in.tail)) else Set() | 
| 61 | } | |
| 62 | ||
| 953 | 63 | val ap = CharParser('a')
 | 
| 64 | val bp = CharParser('b')
 | |
| 65 | ||
| 66 | val abp = SeqParser(ap, bp) | |
| 67 | MapParser(abp, ab => s"$ab").parse("abc")
 | |
| 799 | 68 | |
| 732 | 69 | // an atomic parser for parsing strings according to a regex | 
| 70 | import scala.util.matching.Regex | |
| 71 | ||
| 72 | case class RegexParser(reg: Regex) extends Parser[String, String] {
 | |
| 73 |   def parse(in: String) = reg.findPrefixMatchOf(in) match {
 | |
| 74 | case None => Set() | |
| 959 | 75 | case Some(m) => Set((m.matched, m.after.toString)) | 
| 732 | 76 | } | 
| 77 | } | |
| 78 | ||
| 959 | 79 | // atomic parsers for numbers and "verbatim" strings | 
| 732 | 80 | val NumParser = RegexParser("[0-9]+".r)
 | 
| 81 | def StrParser(s: String) = RegexParser(Regex.quote(s).r) | |
| 82 | ||
| 959 | 83 | NumParser.parse("123a123bc")
 | 
| 906 | 84 | StrParser("else").parse("elsethen")
 | 
| 742 | 85 | |
| 799 | 86 | |
| 732 | 87 | // NumParserInt transforms a "string integer" into a propper Int | 
| 88 | // (needs "new" because MapParser is not a case class) | |
| 89 | ||
| 953 | 90 | val NumParserInt = MapParser(NumParser, (s: String) => s.toInt) | 
| 897 | 91 | NumParserInt.parse("123abc")
 | 
| 732 | 92 | |
| 959 | 93 | // the following string interpolation allows us to write | 
| 94 | // StrParser(_some_string_) more conveniently as | |
| 732 | 95 | // | 
| 959 | 96 | // p"<_some_string_>" | 
| 732 | 97 | |
| 959 | 98 | extension (sc: StringContext) | 
| 99 | def p(args: Any*) = StrParser(sc.s(args*)) | |
| 897 | 100 | |
| 101 | ||
| 959 | 102 | (p"else").parse("elsethen")
 | 
| 799 | 103 | |
| 732 | 104 | // more convenient syntax for parser combinators | 
| 940 | 105 | extension [I: IsSeq, T](p: Parser[I, T]) {
 | 
| 732 | 106 | def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q) | 
| 107 | def ~[S] (q : => Parser[I, S]) = new SeqParser[I, T, S](p, q) | |
| 919 | 108 | def map[S](f: => T => S) = new MapParser[I, T, S](p, f) | 
| 732 | 109 | } | 
| 110 | ||
| 959 | 111 | // simple example of transforming the | 
| 947 | 112 | // result into capital letters | 
| 897 | 113 | def toU(s: String) = s.map(_.toUpper) | 
| 114 | ||
| 959 | 115 | (p"else").map(toU(_)).parse("elseifthen")
 | 
| 897 | 116 | |
| 732 | 117 | // these implicits allow us to use an infix notation for | 
| 118 | // sequences and alternatives; we also can write the usual | |
| 119 | // map for a MapParser | |
| 120 | ||
| 121 | ||
| 122 | // with this NumParserInt can now be written more conveniently | |
| 123 | // as: | |
| 124 | ||
| 799 | 125 | val NumParserInt2 = NumParser.map(_.toInt) | 
| 732 | 126 | |
| 953 | 127 | val x = 1 + 3 | 
| 732 | 128 | |
| 129 | // A parser for palindromes (just returns them as string) | |
| 947 | 130 | // since the parser is recursive it needs to be lazy | 
| 803 | 131 | lazy val Pal : Parser[String, String] = {
 | 
| 959 | 132 |    (p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => s"$x$y$z" } ||
 | 
| 133 |    (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => s"$x$y$z" } ||
 | |
| 896 | 134 | p"a" || p"b" || p"" | 
| 959 | 135 | } | 
| 732 | 136 | |
| 137 | // examples | |
| 953 | 138 | Pal.parse_all("abacaba")
 | 
| 139 | Pal.parse("abacaaba")
 | |
| 732 | 140 | |
| 141 | println("Palindrome: " + Pal.parse_all("abaaaba"))
 | |
| 142 | ||
| 959 | 143 | // A parser for wellnested parentheses | 
| 799 | 144 | // | 
| 145 | // P ::= ( P ) P | epsilon | |
| 146 | // | |
| 147 | //   (transforms '(' -> '{' , ')' -> '}' )
 | |
| 148 | lazy val P : Parser[String, String] = {
 | |
| 919 | 149 |   (p"(" ~ P ~ p")" ~ P).map{ case (((_, x), _), y) => "{" + x + "}" + y } ||
 | 
| 799 | 150 | p"" | 
| 959 | 151 | } | 
| 732 | 152 | |
| 153 | println(P.parse_all("(((()()))())"))
 | |
| 154 | println(P.parse_all("(((()()))()))"))
 | |
| 155 | println(P.parse_all(")("))
 | |
| 156 | println(P.parse_all("()"))
 | |
| 157 | ||
| 158 | // A parser for arithmetic expressions (Terms and Factors) | |
| 898 | 159 | |
| 799 | 160 | lazy val E: Parser[String, Int] = {
 | 
| 732 | 161 |   (T ~ p"+" ~ E).map{ case ((x, _), z) => x + z } ||
 | 
| 799 | 162 |   (T ~ p"-" ~ E).map{ case ((x, _), z) => x - z } || T }
 | 
| 163 | lazy val T: Parser[String, Int] = {
 | |
| 164 |   (F ~ p"*" ~ T).map{ case ((x, _), z) => x * z } || F }
 | |
| 165 | lazy val F: Parser[String, Int] = {
 | |
| 166 |   (p"(" ~ E ~ p")").map{ case ((_, y), _) => y } || NumParserInt }
 | |
| 955 | 167 | |
| 168 | println(E.parse_all("2*2*2"))
 | |
| 732 | 169 | println(E.parse_all("1+3+4"))
 | 
| 170 | println(E.parse("1+3+4"))
 | |
| 171 | println(E.parse_all("4*2+3"))
 | |
| 172 | println(E.parse_all("4*(2+3)"))
 | |
| 953 | 173 | println(E.parse_all("(4)*(((2+3)))"))
 | 
| 732 | 174 | println(E.parse_all("4/2+3"))
 | 
| 175 | println(E.parse("1 + 2 * 3"))
 | |
| 176 | println(E.parse_all("(1+2)+3"))
 | |
| 799 | 177 | println(E.parse_all("1+2+3"))
 | 
| 732 | 178 | |
| 179 | ||
| 960 | 180 | // with parser combinators (and many other parsing algorithms) | 
| 181 | // no left-recursion is allowed, otherwise the will loop; | |
| 182 | // newer versions of Scala (3.5) will actually give a warning | |
| 183 | // about this | |
| 732 | 184 | |
| 960 | 185 | /* | 
| 959 | 186 | lazy val EL: Parser[String, Int] = | 
| 187 |   ((EL ~ p"+" ~ EL).map{ case ((x, y), z) => x + z} ||
 | |
| 732 | 188 |    (EL ~ p"*" ~ EL).map{ case ((x, y), z) => x * z} ||
 | 
| 189 |    (p"(" ~ EL ~ p")").map{ case ((x, y), z) => y} ||
 | |
| 190 | NumParserInt) | |
| 960 | 191 | */ | 
| 732 | 192 | |
| 193 | // this will run forever: | |
| 194 | //println(EL.parse_all("1+2+3"))
 | |
| 195 | ||
| 196 | ||
| 197 | // non-ambiguous vs ambiguous grammars | |
| 198 | ||
| 199 | // ambiguous | |
| 200 | lazy val S : Parser[String, String] = | |
| 201 |   (p"1" ~ S ~ S).map{ case ((x, y), z) => x + y + z } || p""
 | |
| 202 | ||
| 850 | 203 | //println(time(S.parse("1" * 10)))
 | 
| 204 | //println(time(S.parse_all("1" * 10)))
 | |
| 732 | 205 | |
| 206 | // non-ambiguous | |
| 207 | lazy val U : Parser[String, String] = | |
| 208 |   (p"1" ~ U).map{ case (x, y) => x + y } || p""
 | |
| 209 | ||
| 850 | 210 | //println(time(U.parse("1" * 10)))
 | 
| 211 | //println(time(U.parse_all("1" * 10)))
 | |
| 732 | 212 | println(U.parse("1" * 25))
 | 
| 213 | ||
| 214 | U.parse("11")
 | |
| 215 | U.parse("11111")
 | |
| 216 | U.parse("11011")
 | |
| 217 | ||
| 218 | U.parse_all("1" * 100)
 | |
| 219 | U.parse_all("1" * 100 + "0")
 | |
| 220 | ||
| 221 | // you can see the difference in second example | |
| 222 | //S.parse_all("1" * 100)         // succeeds
 | |
| 223 | //S.parse_all("1" * 100 + "0")   // fails
 | |
| 224 | ||
| 225 | ||
| 226 | // A variant which counts how many 1s are parsed | |
| 227 | lazy val UCount : Parser[String, Int] = | |
| 799 | 228 |   (p"1" ~ UCount).map{ case (_, y) => y + 1 } || p"".map{ _ => 0 }
 | 
| 732 | 229 | |
| 230 | println(UCount.parse("11111"))
 | |
| 231 | println(UCount.parse_all("11111"))
 | |
| 232 | ||
| 233 | // Two single character parsers | |
| 234 | lazy val One : Parser[String, String] = p"a" | |
| 235 | lazy val Two : Parser[String, String] = p"b" | |
| 236 | ||
| 237 | One.parse("a")
 | |
| 238 | One.parse("aaa")
 | |
| 239 | ||
| 240 | // note how the pairs nest to the left with sequence parsers | |
| 241 | (One ~ One).parse("aaa")
 | |
| 242 | (One ~ One ~ One).parse("aaa")
 | |
| 243 | (One ~ One ~ One ~ One).parse("aaaa")
 | |
| 244 | ||
| 245 | (One || Two).parse("aaa")
 | |
| 246 | ||
| 247 | ||
| 248 | ||
| 959 | 249 | // a problem with the arithmetic expression parser: it | 
| 742 | 250 | // gets very slow with deeply nested parentheses | 
| 732 | 251 | |
| 919 | 252 | println("A runtime problem")
 | 
| 732 | 253 | println(E.parse("1"))
 | 
| 254 | println(E.parse("(1)"))
 | |
| 255 | println(E.parse("((1))"))
 | |
| 906 | 256 | println(E.parse("(((1)))"))
 | 
| 257 | println(E.parse("((((1))))"))
 | |
| 919 | 258 | println(E.parse("((((((1))))))"))
 | 
| 259 | println(E.parse("(((((((1)))))))"))
 | |
| 906 | 260 | //println(E.parse("((((((((1))))))))"))
 | 
| 828 | 261 | |
| 262 | ||
| 919 | 263 | // faster because of merge in the +/- case | 
| 906 | 264 | lazy val E2: Parser[String, Int] = {
 | 
| 919 | 265 |   (T2 ~ (p"+" || p"-") ~ E2).map[Int]{ case ((x, y), z) => if (y == "+") x + z else x - z} || T2 }
 | 
| 906 | 266 | lazy val T2: Parser[String, Int] = {
 | 
| 919 | 267 |   (F2 ~ p"*" ~ T2).map[Int]{ case ((x, _), z) => x * z } || F2 }
 | 
| 906 | 268 | lazy val F2: Parser[String, Int] = {
 | 
| 919 | 269 |   (p"(" ~ E2 ~ p")").map[Int]{ case ((_, y), _) => y } || NumParserInt }
 | 
| 828 | 270 | |
| 271 | ||
| 919 | 272 | println("mitigated by merging clauses")
 | 
| 906 | 273 | println(E2.parse("1"))
 | 
| 274 | println(E2.parse("(1)"))
 | |
| 275 | println(E2.parse("((1))"))
 | |
| 276 | println(E2.parse("(((1)))"))
 | |
| 277 | println(E2.parse("((((1))))"))
 | |
| 919 | 278 | println(E2.parse("((((((1))))))"))
 | 
| 279 | println(E2.parse("(((((((1)))))))"))
 | |
| 936 | 280 | println(E2.parse("((((((((1))))))))"))
 | 
| 958 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 281 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 282 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 283 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 284 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 285 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 286 | /* | 
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 287 | Try | 
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 288 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 289 | 6 / 2 * (2+1) | 
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 290 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 291 | */ |