| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Wed, 29 May 2024 13:25:30 +0100 | |
| changeset 959 | 787ef75ec006 | 
| parent 958 | 6caee1c0222e | 
| child 960 | 791f4d9f53e1 | 
| permissions | -rw-r--r-- | 
| 732 | 1  | 
// Parser Combinators: Simple Version  | 
2  | 
//====================================  | 
|
| 742 | 3  | 
//  | 
4  | 
// Call with  | 
|
5  | 
//  | 
|
6  | 
// amm comb1.sc  | 
|
| 732 | 7  | 
|
| 959 | 8  | 
|
| 906 | 9  | 
// Note, in the lectures I did not show the type bound  | 
| 959 | 10  | 
// using is: I => Seq[_], which means that the input  | 
11  | 
// type 'I' needs to be a sequence.  | 
|
| 732 | 12  | 
|
| 959 | 13  | 
type IsSeq[I] = I => Seq[?]  | 
| 940 | 14  | 
|
| 959 | 15  | 
abstract class Parser[I: IsSeq, T](using is: IsSeq[I])  {
 | 
16  | 
def parse(in: I): Set[(T, I)]  | 
|
| 732 | 17  | 
|
18  | 
def parse_all(in: I) : Set[T] =  | 
|
| 959 | 19  | 
for ((hd, tl) <- parse(in);  | 
| 906 | 20  | 
if is(tl).isEmpty) yield hd  | 
| 732 | 21  | 
}  | 
22  | 
||
| 953 | 23  | 
|
| 732 | 24  | 
// parser combinators  | 
25  | 
||
| 940 | 26  | 
|
| 799 | 27  | 
// alternative parser  | 
| 959 | 28  | 
class AltParser[I : IsSeq, T](p: => Parser[I, T],  | 
| 940 | 29  | 
                              q: => Parser[I, T]) extends Parser[I, T] {
 | 
| 959 | 30  | 
def parse(in: I) = p.parse(in) ++ q.parse(in)  | 
| 799 | 31  | 
}  | 
32  | 
||
| 732 | 33  | 
// sequence parser  | 
| 959 | 34  | 
class SeqParser[I: IsSeq, T, S](p: => Parser[I, T],  | 
| 940 | 35  | 
                                q: => Parser[I, S]) extends Parser[I, (T, S)] {
 | 
| 959 | 36  | 
def parse(in: I) =  | 
37  | 
for ((hd1, tl1) <- p.parse(in);  | 
|
| 732 | 38  | 
(hd2, tl2) <- q.parse(tl1)) yield ((hd1, hd2), tl2)  | 
39  | 
}  | 
|
40  | 
||
| 742 | 41  | 
// map parser  | 
| 959 | 42  | 
class MapParser[I : IsSeq, T, S](p: => Parser[I, T],  | 
| 953 | 43  | 
                                 f: T => S) extends Parser[I, S] {
 | 
| 732 | 44  | 
def parse(in: I) = for ((hd, tl) <- p.parse(in)) yield (f(hd), tl)  | 
45  | 
}  | 
|
46  | 
||
| 742 | 47  | 
|
48  | 
||
| 732 | 49  | 
// an example of an atomic parser for characters  | 
50  | 
case class CharParser(c: Char) extends Parser[String, Char] {
 | 
|
| 959 | 51  | 
def parse(in: String) =  | 
| 732 | 52  | 
if (in != "" && in.head == c) Set((c, in.tail)) else Set()  | 
53  | 
}  | 
|
54  | 
||
| 953 | 55  | 
val ap = CharParser('a')
 | 
56  | 
val bp = CharParser('b')
 | 
|
57  | 
||
58  | 
val abp = SeqParser(ap, bp)  | 
|
59  | 
MapParser(abp, ab => s"$ab").parse("abc")
 | 
|
| 799 | 60  | 
|
| 732 | 61  | 
// an atomic parser for parsing strings according to a regex  | 
62  | 
import scala.util.matching.Regex  | 
|
63  | 
||
64  | 
case class RegexParser(reg: Regex) extends Parser[String, String] {
 | 
|
65  | 
  def parse(in: String) = reg.findPrefixMatchOf(in) match {
 | 
|
66  | 
case None => Set()  | 
|
| 959 | 67  | 
case Some(m) => Set((m.matched, m.after.toString))  | 
| 732 | 68  | 
}  | 
69  | 
}  | 
|
70  | 
||
| 959 | 71  | 
// atomic parsers for numbers and "verbatim" strings  | 
| 732 | 72  | 
val NumParser = RegexParser("[0-9]+".r)
 | 
73  | 
def StrParser(s: String) = RegexParser(Regex.quote(s).r)  | 
|
74  | 
||
| 959 | 75  | 
NumParser.parse("123a123bc")
 | 
| 906 | 76  | 
StrParser("else").parse("elsethen")
 | 
| 742 | 77  | 
|
| 799 | 78  | 
|
| 732 | 79  | 
// NumParserInt transforms a "string integer" into a propper Int  | 
80  | 
// (needs "new" because MapParser is not a case class)  | 
|
81  | 
||
| 953 | 82  | 
val NumParserInt = MapParser(NumParser, (s: String) => s.toInt)  | 
| 897 | 83  | 
NumParserInt.parse("123abc")
 | 
| 732 | 84  | 
|
| 959 | 85  | 
// the following string interpolation allows us to write  | 
86  | 
// StrParser(_some_string_) more conveniently as  | 
|
| 732 | 87  | 
//  | 
| 959 | 88  | 
// p"<_some_string_>"  | 
| 732 | 89  | 
|
| 959 | 90  | 
extension (sc: StringContext)  | 
91  | 
def p(args: Any*) = StrParser(sc.s(args*))  | 
|
| 897 | 92  | 
|
93  | 
||
| 959 | 94  | 
(p"else").parse("elsethen")
 | 
| 799 | 95  | 
|
| 732 | 96  | 
// more convenient syntax for parser combinators  | 
| 940 | 97  | 
extension [I: IsSeq, T](p: Parser[I, T]) {
 | 
| 732 | 98  | 
def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q)  | 
99  | 
def ~[S] (q : => Parser[I, S]) = new SeqParser[I, T, S](p, q)  | 
|
| 919 | 100  | 
def map[S](f: => T => S) = new MapParser[I, T, S](p, f)  | 
| 732 | 101  | 
}  | 
102  | 
||
| 959 | 103  | 
// simple example of transforming the  | 
| 947 | 104  | 
// result into capital letters  | 
| 897 | 105  | 
def toU(s: String) = s.map(_.toUpper)  | 
106  | 
||
| 959 | 107  | 
(p"else").map(toU(_)).parse("elseifthen")
 | 
| 897 | 108  | 
|
| 732 | 109  | 
// these implicits allow us to use an infix notation for  | 
110  | 
// sequences and alternatives; we also can write the usual  | 
|
111  | 
// map for a MapParser  | 
|
112  | 
||
113  | 
||
114  | 
// with this NumParserInt can now be written more conveniently  | 
|
115  | 
// as:  | 
|
116  | 
||
| 799 | 117  | 
val NumParserInt2 = NumParser.map(_.toInt)  | 
| 732 | 118  | 
|
| 953 | 119  | 
val x = 1 + 3  | 
| 732 | 120  | 
|
121  | 
// A parser for palindromes (just returns them as string)  | 
|
| 947 | 122  | 
// since the parser is recursive it needs to be lazy  | 
| 803 | 123  | 
lazy val Pal : Parser[String, String] = {
 | 
| 959 | 124  | 
   (p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => s"$x$y$z" } ||
 | 
125  | 
   (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => s"$x$y$z" } ||
 | 
|
| 896 | 126  | 
p"a" || p"b" || p""  | 
| 959 | 127  | 
}  | 
| 732 | 128  | 
|
129  | 
// examples  | 
|
| 953 | 130  | 
Pal.parse_all("abacaba")
 | 
131  | 
Pal.parse("abacaaba")
 | 
|
| 732 | 132  | 
|
133  | 
println("Palindrome: " + Pal.parse_all("abaaaba"))
 | 
|
134  | 
||
| 959 | 135  | 
// A parser for wellnested parentheses  | 
| 799 | 136  | 
//  | 
137  | 
// P ::= ( P ) P | epsilon  | 
|
138  | 
//  | 
|
139  | 
//   (transforms '(' -> '{' , ')' -> '}' )
 | 
|
140  | 
lazy val P : Parser[String, String] = {
 | 
|
| 919 | 141  | 
  (p"(" ~ P ~ p")" ~ P).map{ case (((_, x), _), y) => "{" + x + "}" + y } ||
 | 
| 799 | 142  | 
p""  | 
| 959 | 143  | 
}  | 
| 732 | 144  | 
|
145  | 
println(P.parse_all("(((()()))())"))
 | 
|
146  | 
println(P.parse_all("(((()()))()))"))
 | 
|
147  | 
println(P.parse_all(")("))
 | 
|
148  | 
println(P.parse_all("()"))
 | 
|
149  | 
||
150  | 
// A parser for arithmetic expressions (Terms and Factors)  | 
|
| 898 | 151  | 
|
| 799 | 152  | 
lazy val E: Parser[String, Int] = {
 | 
| 732 | 153  | 
  (T ~ p"+" ~ E).map{ case ((x, _), z) => x + z } ||
 | 
| 799 | 154  | 
  (T ~ p"-" ~ E).map{ case ((x, _), z) => x - z } || T }
 | 
155  | 
lazy val T: Parser[String, Int] = {
 | 
|
156  | 
  (F ~ p"*" ~ T).map{ case ((x, _), z) => x * z } || F }
 | 
|
157  | 
lazy val F: Parser[String, Int] = {
 | 
|
158  | 
  (p"(" ~ E ~ p")").map{ case ((_, y), _) => y } || NumParserInt }
 | 
|
| 955 | 159  | 
|
160  | 
println(E.parse_all("2*2*2"))
 | 
|
| 732 | 161  | 
println(E.parse_all("1+3+4"))
 | 
162  | 
println(E.parse("1+3+4"))
 | 
|
163  | 
println(E.parse_all("4*2+3"))
 | 
|
164  | 
println(E.parse_all("4*(2+3)"))
 | 
|
| 953 | 165  | 
println(E.parse_all("(4)*(((2+3)))"))
 | 
| 732 | 166  | 
println(E.parse_all("4/2+3"))
 | 
167  | 
println(E.parse("1 + 2 * 3"))
 | 
|
168  | 
println(E.parse_all("(1+2)+3"))
 | 
|
| 799 | 169  | 
println(E.parse_all("1+2+3"))
 | 
| 732 | 170  | 
|
171  | 
||
172  | 
// with parser combinators (and other parsing algorithms)  | 
|
173  | 
// no left-recursion is allowed, otherwise the will loop  | 
|
174  | 
||
| 959 | 175  | 
lazy val EL: Parser[String, Int] =  | 
176  | 
  ((EL ~ p"+" ~ EL).map{ case ((x, y), z) => x + z} ||
 | 
|
| 732 | 177  | 
   (EL ~ p"*" ~ EL).map{ case ((x, y), z) => x * z} ||
 | 
178  | 
   (p"(" ~ EL ~ p")").map{ case ((x, y), z) => y} ||
 | 
|
179  | 
NumParserInt)  | 
|
180  | 
||
181  | 
// this will run forever:  | 
|
182  | 
//println(EL.parse_all("1+2+3"))
 | 
|
183  | 
||
184  | 
||
185  | 
// non-ambiguous vs ambiguous grammars  | 
|
186  | 
||
187  | 
// ambiguous  | 
|
188  | 
lazy val S : Parser[String, String] =  | 
|
189  | 
  (p"1" ~ S ~ S).map{ case ((x, y), z) => x + y + z } || p""
 | 
|
190  | 
||
| 850 | 191  | 
//println(time(S.parse("1" * 10)))
 | 
192  | 
//println(time(S.parse_all("1" * 10)))
 | 
|
| 732 | 193  | 
|
194  | 
// non-ambiguous  | 
|
195  | 
lazy val U : Parser[String, String] =  | 
|
196  | 
  (p"1" ~ U).map{ case (x, y) => x + y } || p""
 | 
|
197  | 
||
| 850 | 198  | 
//println(time(U.parse("1" * 10)))
 | 
199  | 
//println(time(U.parse_all("1" * 10)))
 | 
|
| 732 | 200  | 
println(U.parse("1" * 25))
 | 
201  | 
||
202  | 
U.parse("11")
 | 
|
203  | 
U.parse("11111")
 | 
|
204  | 
U.parse("11011")
 | 
|
205  | 
||
206  | 
U.parse_all("1" * 100)
 | 
|
207  | 
U.parse_all("1" * 100 + "0")
 | 
|
208  | 
||
209  | 
// you can see the difference in second example  | 
|
210  | 
//S.parse_all("1" * 100)         // succeeds
 | 
|
211  | 
//S.parse_all("1" * 100 + "0")   // fails
 | 
|
212  | 
||
213  | 
||
214  | 
// A variant which counts how many 1s are parsed  | 
|
215  | 
lazy val UCount : Parser[String, Int] =  | 
|
| 799 | 216  | 
  (p"1" ~ UCount).map{ case (_, y) => y + 1 } || p"".map{ _ => 0 }
 | 
| 732 | 217  | 
|
218  | 
println(UCount.parse("11111"))
 | 
|
219  | 
println(UCount.parse_all("11111"))
 | 
|
220  | 
||
221  | 
// Two single character parsers  | 
|
222  | 
lazy val One : Parser[String, String] = p"a"  | 
|
223  | 
lazy val Two : Parser[String, String] = p"b"  | 
|
224  | 
||
225  | 
One.parse("a")
 | 
|
226  | 
One.parse("aaa")
 | 
|
227  | 
||
228  | 
// note how the pairs nest to the left with sequence parsers  | 
|
229  | 
(One ~ One).parse("aaa")
 | 
|
230  | 
(One ~ One ~ One).parse("aaa")
 | 
|
231  | 
(One ~ One ~ One ~ One).parse("aaaa")
 | 
|
232  | 
||
233  | 
(One || Two).parse("aaa")
 | 
|
234  | 
||
235  | 
||
236  | 
||
| 959 | 237  | 
// a problem with the arithmetic expression parser: it  | 
| 742 | 238  | 
// gets very slow with deeply nested parentheses  | 
| 732 | 239  | 
|
| 919 | 240  | 
println("A runtime problem")
 | 
| 732 | 241  | 
println(E.parse("1"))
 | 
242  | 
println(E.parse("(1)"))
 | 
|
243  | 
println(E.parse("((1))"))
 | 
|
| 906 | 244  | 
println(E.parse("(((1)))"))
 | 
245  | 
println(E.parse("((((1))))"))
 | 
|
| 919 | 246  | 
println(E.parse("((((((1))))))"))
 | 
247  | 
println(E.parse("(((((((1)))))))"))
 | 
|
| 906 | 248  | 
//println(E.parse("((((((((1))))))))"))
 | 
| 828 | 249  | 
|
250  | 
||
| 919 | 251  | 
// faster because of merge in the +/- case  | 
| 906 | 252  | 
lazy val E2: Parser[String, Int] = {
 | 
| 919 | 253  | 
  (T2 ~ (p"+" || p"-") ~ E2).map[Int]{ case ((x, y), z) => if (y == "+") x + z else x - z} || T2 }
 | 
| 906 | 254  | 
lazy val T2: Parser[String, Int] = {
 | 
| 919 | 255  | 
  (F2 ~ p"*" ~ T2).map[Int]{ case ((x, _), z) => x * z } || F2 }
 | 
| 906 | 256  | 
lazy val F2: Parser[String, Int] = {
 | 
| 919 | 257  | 
  (p"(" ~ E2 ~ p")").map[Int]{ case ((_, y), _) => y } || NumParserInt }
 | 
| 828 | 258  | 
|
259  | 
||
| 919 | 260  | 
println("mitigated by merging clauses")
 | 
| 906 | 261  | 
println(E2.parse("1"))
 | 
262  | 
println(E2.parse("(1)"))
 | 
|
263  | 
println(E2.parse("((1))"))
 | 
|
264  | 
println(E2.parse("(((1)))"))
 | 
|
265  | 
println(E2.parse("((((1))))"))
 | 
|
| 919 | 266  | 
println(E2.parse("((((((1))))))"))
 | 
267  | 
println(E2.parse("(((((((1)))))))"))
 | 
|
| 936 | 268  | 
println(E2.parse("((((((((1))))))))"))
 | 
| 
958
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
269  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
270  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
271  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
272  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
273  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
274  | 
/*  | 
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
275  | 
Try  | 
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
276  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
277  | 
6 / 2 * (2+1)  | 
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
278  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
279  | 
*/  |