| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Wed, 21 Feb 2024 09:14:12 +0000 | |
| changeset 958 | 6caee1c0222e | 
| parent 955 | ca67172b8fa1 | 
| child 959 | 787ef75ec006 | 
| permissions | -rw-r--r-- | 
| 732 | 1 | // Parser Combinators: Simple Version | 
| 2 | //==================================== | |
| 742 | 3 | // | 
| 4 | // Call with | |
| 5 | // | |
| 6 | // amm comb1.sc | |
| 732 | 7 | |
| 742 | 8 | |
| 906 | 9 | // Note, in the lectures I did not show the type bound | 
| 10 | // using is: I => Seq[_], which means that the input | |
| 11 | // type 'I' needs to be a sequence. | |
| 732 | 12 | |
| 940 | 13 | type IsSeq[I] = I => Seq[_] | 
| 14 | ||
| 15 | abstract class Parser[I, T](using is: IsSeq[I])  {
 | |
| 852 | 16 | def parse(in: I): Set[(T, I)] | 
| 732 | 17 | |
| 18 | def parse_all(in: I) : Set[T] = | |
| 19 | for ((hd, tl) <- parse(in); | |
| 906 | 20 | if is(tl).isEmpty) yield hd | 
| 732 | 21 | } | 
| 22 | ||
| 953 | 23 | |
| 732 | 24 | // parser combinators | 
| 25 | ||
| 940 | 26 | |
| 799 | 27 | // alternative parser | 
| 940 | 28 | class AltParser[I : IsSeq, T](p: => Parser[I, T], | 
| 29 |                               q: => Parser[I, T]) extends Parser[I, T] {
 | |
| 799 | 30 | def parse(in: I) = p.parse(in) ++ q.parse(in) | 
| 31 | } | |
| 32 | ||
| 732 | 33 | // sequence parser | 
| 940 | 34 | class SeqParser[I: IsSeq, T, S](p: => Parser[I, T], | 
| 35 |                                 q: => Parser[I, S]) extends Parser[I, (T, S)] {
 | |
| 732 | 36 | def parse(in: I) = | 
| 37 | for ((hd1, tl1) <- p.parse(in); | |
| 38 | (hd2, tl2) <- q.parse(tl1)) yield ((hd1, hd2), tl2) | |
| 39 | } | |
| 40 | ||
| 742 | 41 | // map parser | 
| 940 | 42 | class MapParser[I : IsSeq, T, S](p: => Parser[I, T], | 
| 953 | 43 |                                  f: T => S) extends Parser[I, S] {
 | 
| 732 | 44 | def parse(in: I) = for ((hd, tl) <- p.parse(in)) yield (f(hd), tl) | 
| 45 | } | |
| 46 | ||
| 742 | 47 | |
| 48 | ||
| 732 | 49 | // an example of an atomic parser for characters | 
| 50 | case class CharParser(c: Char) extends Parser[String, Char] {
 | |
| 51 | def parse(in: String) = | |
| 52 | if (in != "" && in.head == c) Set((c, in.tail)) else Set() | |
| 53 | } | |
| 54 | ||
| 953 | 55 | val ap = CharParser('a')
 | 
| 56 | val bp = CharParser('b')
 | |
| 57 | ||
| 58 | val abp = SeqParser(ap, bp) | |
| 59 | MapParser(abp, ab => s"$ab").parse("abc")
 | |
| 799 | 60 | |
| 732 | 61 | // an atomic parser for parsing strings according to a regex | 
| 62 | import scala.util.matching.Regex | |
| 63 | ||
| 64 | case class RegexParser(reg: Regex) extends Parser[String, String] {
 | |
| 65 |   def parse(in: String) = reg.findPrefixMatchOf(in) match {
 | |
| 66 | case None => Set() | |
| 67 | case Some(m) => Set((m.matched, m.after.toString)) | |
| 68 | } | |
| 69 | } | |
| 70 | ||
| 71 | // atomic parsers for numbers and "verbatim" strings | |
| 72 | val NumParser = RegexParser("[0-9]+".r)
 | |
| 73 | def StrParser(s: String) = RegexParser(Regex.quote(s).r) | |
| 74 | ||
| 953 | 75 | NumParser.parse("123a123bc") 
 | 
| 906 | 76 | StrParser("else").parse("elsethen")
 | 
| 742 | 77 | |
| 799 | 78 | |
| 732 | 79 | // NumParserInt transforms a "string integer" into a propper Int | 
| 80 | // (needs "new" because MapParser is not a case class) | |
| 81 | ||
| 953 | 82 | val NumParserInt = MapParser(NumParser, (s: String) => s.toInt) | 
| 897 | 83 | NumParserInt.parse("123abc")
 | 
| 732 | 84 | |
| 85 | // the following string interpolation allows us to write | |
| 86 | // StrParser(_some_string_) more conveniently as | |
| 87 | // | |
| 88 | // p"<_some_string_>" | |
| 89 | ||
| 906 | 90 | extension (sc: StringContext) | 
| 91 | def p(args: Any*) = StrParser(sc.s(args:_*)) | |
| 897 | 92 | |
| 93 | ||
| 94 | (p"else").parse("elsethen")           
 | |
| 799 | 95 | |
| 732 | 96 | // more convenient syntax for parser combinators | 
| 940 | 97 | extension [I: IsSeq, T](p: Parser[I, T]) {
 | 
| 732 | 98 | def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q) | 
| 99 | def ~[S] (q : => Parser[I, S]) = new SeqParser[I, T, S](p, q) | |
| 919 | 100 | def map[S](f: => T => S) = new MapParser[I, T, S](p, f) | 
| 732 | 101 | } | 
| 102 | ||
| 947 | 103 | // simple example of transforming the | 
| 104 | // result into capital letters | |
| 897 | 105 | def toU(s: String) = s.map(_.toUpper) | 
| 106 | ||
| 947 | 107 | (p"else").map(toU(_)).parse("elseifthen")  
 | 
| 897 | 108 | |
| 732 | 109 | // these implicits allow us to use an infix notation for | 
| 110 | // sequences and alternatives; we also can write the usual | |
| 111 | // map for a MapParser | |
| 112 | ||
| 113 | ||
| 114 | // with this NumParserInt can now be written more conveniently | |
| 115 | // as: | |
| 116 | ||
| 799 | 117 | val NumParserInt2 = NumParser.map(_.toInt) | 
| 732 | 118 | |
| 953 | 119 | val x = 1 + 3 | 
| 732 | 120 | |
| 121 | // A parser for palindromes (just returns them as string) | |
| 947 | 122 | // since the parser is recursive it needs to be lazy | 
| 803 | 123 | lazy val Pal : Parser[String, String] = {
 | 
| 919 | 124 |    (p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => s"$x$y$z" } || 
 | 
| 125 |    (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => s"$x$y$z" } || 
 | |
| 896 | 126 | p"a" || p"b" || p"" | 
| 732 | 127 | } | 
| 128 | ||
| 129 | // examples | |
| 953 | 130 | Pal.parse_all("abacaba")
 | 
| 131 | Pal.parse("abacaaba")
 | |
| 732 | 132 | |
| 133 | println("Palindrome: " + Pal.parse_all("abaaaba"))
 | |
| 134 | ||
| 799 | 135 | // A parser for wellnested parentheses | 
| 136 | // | |
| 137 | // P ::= ( P ) P | epsilon | |
| 138 | // | |
| 139 | //   (transforms '(' -> '{' , ')' -> '}' )
 | |
| 140 | lazy val P : Parser[String, String] = {
 | |
| 919 | 141 |   (p"(" ~ P ~ p")" ~ P).map{ case (((_, x), _), y) => "{" + x + "}" + y } ||
 | 
| 799 | 142 | p"" | 
| 143 | } | |
| 732 | 144 | |
| 145 | println(P.parse_all("(((()()))())"))
 | |
| 146 | println(P.parse_all("(((()()))()))"))
 | |
| 147 | println(P.parse_all(")("))
 | |
| 148 | println(P.parse_all("()"))
 | |
| 149 | ||
| 150 | // A parser for arithmetic expressions (Terms and Factors) | |
| 898 | 151 | |
| 799 | 152 | lazy val E: Parser[String, Int] = {
 | 
| 732 | 153 |   (T ~ p"+" ~ E).map{ case ((x, _), z) => x + z } ||
 | 
| 799 | 154 |   (T ~ p"-" ~ E).map{ case ((x, _), z) => x - z } || T }
 | 
| 155 | lazy val T: Parser[String, Int] = {
 | |
| 156 |   (F ~ p"*" ~ T).map{ case ((x, _), z) => x * z } || F }
 | |
| 157 | lazy val F: Parser[String, Int] = {
 | |
| 158 |   (p"(" ~ E ~ p")").map{ case ((_, y), _) => y } || NumParserInt }
 | |
| 955 | 159 | |
| 160 | println(E.parse_all("2*2*2"))
 | |
| 732 | 161 | println(E.parse_all("1+3+4"))
 | 
| 162 | println(E.parse("1+3+4"))
 | |
| 163 | println(E.parse_all("4*2+3"))
 | |
| 164 | println(E.parse_all("4*(2+3)"))
 | |
| 953 | 165 | println(E.parse_all("(4)*(((2+3)))"))
 | 
| 732 | 166 | println(E.parse_all("4/2+3"))
 | 
| 167 | println(E.parse("1 + 2 * 3"))
 | |
| 168 | println(E.parse_all("(1+2)+3"))
 | |
| 799 | 169 | println(E.parse_all("1+2+3"))
 | 
| 732 | 170 | |
| 171 | ||
| 172 | // with parser combinators (and other parsing algorithms) | |
| 173 | // no left-recursion is allowed, otherwise the will loop | |
| 174 | ||
| 175 | lazy val EL: Parser[String, Int] = | |
| 176 |   ((EL ~ p"+" ~ EL).map{ case ((x, y), z) => x + z} || 
 | |
| 177 |    (EL ~ p"*" ~ EL).map{ case ((x, y), z) => x * z} ||
 | |
| 178 |    (p"(" ~ EL ~ p")").map{ case ((x, y), z) => y} ||
 | |
| 179 | NumParserInt) | |
| 180 | ||
| 181 | // this will run forever: | |
| 182 | //println(EL.parse_all("1+2+3"))
 | |
| 183 | ||
| 184 | ||
| 185 | // non-ambiguous vs ambiguous grammars | |
| 186 | ||
| 187 | // ambiguous | |
| 188 | lazy val S : Parser[String, String] = | |
| 189 |   (p"1" ~ S ~ S).map{ case ((x, y), z) => x + y + z } || p""
 | |
| 190 | ||
| 850 | 191 | //println(time(S.parse("1" * 10)))
 | 
| 192 | //println(time(S.parse_all("1" * 10)))
 | |
| 732 | 193 | |
| 194 | // non-ambiguous | |
| 195 | lazy val U : Parser[String, String] = | |
| 196 |   (p"1" ~ U).map{ case (x, y) => x + y } || p""
 | |
| 197 | ||
| 850 | 198 | //println(time(U.parse("1" * 10)))
 | 
| 199 | //println(time(U.parse_all("1" * 10)))
 | |
| 732 | 200 | println(U.parse("1" * 25))
 | 
| 201 | ||
| 202 | U.parse("11")
 | |
| 203 | U.parse("11111")
 | |
| 204 | U.parse("11011")
 | |
| 205 | ||
| 206 | U.parse_all("1" * 100)
 | |
| 207 | U.parse_all("1" * 100 + "0")
 | |
| 208 | ||
| 209 | // you can see the difference in second example | |
| 210 | //S.parse_all("1" * 100)         // succeeds
 | |
| 211 | //S.parse_all("1" * 100 + "0")   // fails
 | |
| 212 | ||
| 213 | ||
| 214 | // A variant which counts how many 1s are parsed | |
| 215 | lazy val UCount : Parser[String, Int] = | |
| 799 | 216 |   (p"1" ~ UCount).map{ case (_, y) => y + 1 } || p"".map{ _ => 0 }
 | 
| 732 | 217 | |
| 218 | println(UCount.parse("11111"))
 | |
| 219 | println(UCount.parse_all("11111"))
 | |
| 220 | ||
| 221 | // Two single character parsers | |
| 222 | lazy val One : Parser[String, String] = p"a" | |
| 223 | lazy val Two : Parser[String, String] = p"b" | |
| 224 | ||
| 225 | One.parse("a")
 | |
| 226 | One.parse("aaa")
 | |
| 227 | ||
| 228 | // note how the pairs nest to the left with sequence parsers | |
| 229 | (One ~ One).parse("aaa")
 | |
| 230 | (One ~ One ~ One).parse("aaa")
 | |
| 231 | (One ~ One ~ One ~ One).parse("aaaa")
 | |
| 232 | ||
| 233 | (One || Two).parse("aaa")
 | |
| 234 | ||
| 235 | ||
| 236 | ||
| 237 | // a problem with the arithmetic expression parser: it | |
| 742 | 238 | // gets very slow with deeply nested parentheses | 
| 732 | 239 | |
| 919 | 240 | println("A runtime problem")
 | 
| 732 | 241 | println(E.parse("1"))
 | 
| 242 | println(E.parse("(1)"))
 | |
| 243 | println(E.parse("((1))"))
 | |
| 906 | 244 | println(E.parse("(((1)))"))
 | 
| 245 | println(E.parse("((((1))))"))
 | |
| 919 | 246 | println(E.parse("((((((1))))))"))
 | 
| 247 | println(E.parse("(((((((1)))))))"))
 | |
| 906 | 248 | //println(E.parse("((((((((1))))))))"))
 | 
| 828 | 249 | |
| 250 | ||
| 919 | 251 | // faster because of merge in the +/- case | 
| 906 | 252 | lazy val E2: Parser[String, Int] = {
 | 
| 919 | 253 |   (T2 ~ (p"+" || p"-") ~ E2).map[Int]{ case ((x, y), z) => if (y == "+") x + z else x - z} || T2 }
 | 
| 906 | 254 | lazy val T2: Parser[String, Int] = {
 | 
| 919 | 255 |   (F2 ~ p"*" ~ T2).map[Int]{ case ((x, _), z) => x * z } || F2 }
 | 
| 906 | 256 | lazy val F2: Parser[String, Int] = {
 | 
| 919 | 257 |   (p"(" ~ E2 ~ p")").map[Int]{ case ((_, y), _) => y } || NumParserInt }
 | 
| 828 | 258 | |
| 259 | ||
| 919 | 260 | println("mitigated by merging clauses")
 | 
| 906 | 261 | println(E2.parse("1"))
 | 
| 262 | println(E2.parse("(1)"))
 | |
| 263 | println(E2.parse("((1))"))
 | |
| 264 | println(E2.parse("(((1)))"))
 | |
| 265 | println(E2.parse("((((1))))"))
 | |
| 919 | 266 | println(E2.parse("((((((1))))))"))
 | 
| 267 | println(E2.parse("(((((((1)))))))"))
 | |
| 936 | 268 | println(E2.parse("((((((((1))))))))"))
 | 
| 958 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 269 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 270 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 271 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 272 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 273 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 274 | /* | 
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 275 | Try | 
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 276 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 277 | 6 / 2 * (2+1) | 
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 278 | |
| 
6caee1c0222e
updated and added pascal.while file
 Christian Urban <christian.urban@kcl.ac.uk> parents: 
955diff
changeset | 279 | */ |