| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Fri, 25 Oct 2024 18:54:08 +0100 | |
| changeset 970 | e15be5466802 | 
| parent 960 | 791f4d9f53e1 | 
| child 971 | b7d97a2a083b | 
| permissions | -rw-r--r-- | 
| 732 | 1  | 
// Parser Combinators: Simple Version  | 
2  | 
//====================================  | 
|
| 742 | 3  | 
//  | 
4  | 
// Call with  | 
|
5  | 
//  | 
|
6  | 
// amm comb1.sc  | 
|
| 732 | 7  | 
|
| 959 | 8  | 
|
| 906 | 9  | 
// Note, in the lectures I did not show the type bound  | 
| 960 | 10  | 
// I : IsSeq, which means that the input type 'I' needs  | 
11  | 
// to be a sequence that can be tested to be empty.  | 
|
12  | 
||
13  | 
trait IsSeq[I] {
 | 
|
14  | 
extension (i: I) def isEmpty: Boolean  | 
|
15  | 
}  | 
|
| 732 | 16  | 
|
| 960 | 17  | 
given IsSeq[String] = _.isEmpty  | 
18  | 
given [I]: IsSeq[Seq[I]] = _.isEmpty  | 
|
19  | 
||
| 940 | 20  | 
|
| 960 | 21  | 
// parser class  | 
22  | 
//==============  | 
|
23  | 
||
24  | 
abstract class Parser[I : IsSeq, T]  {
 | 
|
| 959 | 25  | 
def parse(in: I): Set[(T, I)]  | 
| 732 | 26  | 
|
27  | 
def parse_all(in: I) : Set[T] =  | 
|
| 959 | 28  | 
for ((hd, tl) <- parse(in);  | 
| 960 | 29  | 
if tl.isEmpty) yield hd  | 
| 732 | 30  | 
}  | 
31  | 
||
32  | 
// parser combinators  | 
|
| 960 | 33  | 
//====================  | 
| 940 | 34  | 
|
| 799 | 35  | 
// alternative parser  | 
| 959 | 36  | 
class AltParser[I : IsSeq, T](p: => Parser[I, T],  | 
| 940 | 37  | 
                              q: => Parser[I, T]) extends Parser[I, T] {
 | 
| 959 | 38  | 
def parse(in: I) = p.parse(in) ++ q.parse(in)  | 
| 799 | 39  | 
}  | 
40  | 
||
| 732 | 41  | 
// sequence parser  | 
| 959 | 42  | 
class SeqParser[I: IsSeq, T, S](p: => Parser[I, T],  | 
| 940 | 43  | 
                                q: => Parser[I, S]) extends Parser[I, (T, S)] {
 | 
| 959 | 44  | 
def parse(in: I) =  | 
45  | 
for ((hd1, tl1) <- p.parse(in);  | 
|
| 732 | 46  | 
(hd2, tl2) <- q.parse(tl1)) yield ((hd1, hd2), tl2)  | 
47  | 
}  | 
|
48  | 
||
| 742 | 49  | 
// map parser  | 
| 959 | 50  | 
class MapParser[I : IsSeq, T, S](p: => Parser[I, T],  | 
| 953 | 51  | 
                                 f: T => S) extends Parser[I, S] {
 | 
| 732 | 52  | 
def parse(in: I) = for ((hd, tl) <- p.parse(in)) yield (f(hd), tl)  | 
53  | 
}  | 
|
54  | 
||
| 742 | 55  | 
|
56  | 
||
| 732 | 57  | 
// an example of an atomic parser for characters  | 
58  | 
case class CharParser(c: Char) extends Parser[String, Char] {
 | 
|
| 959 | 59  | 
def parse(in: String) =  | 
| 732 | 60  | 
if (in != "" && in.head == c) Set((c, in.tail)) else Set()  | 
61  | 
}  | 
|
62  | 
||
| 953 | 63  | 
val ap = CharParser('a')
 | 
64  | 
val bp = CharParser('b')
 | 
|
65  | 
||
66  | 
val abp = SeqParser(ap, bp)  | 
|
67  | 
MapParser(abp, ab => s"$ab").parse("abc")
 | 
|
| 799 | 68  | 
|
| 732 | 69  | 
// an atomic parser for parsing strings according to a regex  | 
70  | 
import scala.util.matching.Regex  | 
|
71  | 
||
72  | 
case class RegexParser(reg: Regex) extends Parser[String, String] {
 | 
|
73  | 
  def parse(in: String) = reg.findPrefixMatchOf(in) match {
 | 
|
74  | 
case None => Set()  | 
|
| 959 | 75  | 
case Some(m) => Set((m.matched, m.after.toString))  | 
| 732 | 76  | 
}  | 
77  | 
}  | 
|
78  | 
||
| 959 | 79  | 
// atomic parsers for numbers and "verbatim" strings  | 
| 732 | 80  | 
val NumParser = RegexParser("[0-9]+".r)
 | 
81  | 
def StrParser(s: String) = RegexParser(Regex.quote(s).r)  | 
|
82  | 
||
| 959 | 83  | 
NumParser.parse("123a123bc")
 | 
| 906 | 84  | 
StrParser("else").parse("elsethen")
 | 
| 742 | 85  | 
|
| 799 | 86  | 
|
| 732 | 87  | 
// NumParserInt transforms a "string integer" into a propper Int  | 
88  | 
// (needs "new" because MapParser is not a case class)  | 
|
89  | 
||
| 953 | 90  | 
val NumParserInt = MapParser(NumParser, (s: String) => s.toInt)  | 
| 897 | 91  | 
NumParserInt.parse("123abc")
 | 
| 732 | 92  | 
|
| 959 | 93  | 
// the following string interpolation allows us to write  | 
94  | 
// StrParser(_some_string_) more conveniently as  | 
|
| 732 | 95  | 
//  | 
| 959 | 96  | 
// p"<_some_string_>"  | 
| 732 | 97  | 
|
| 959 | 98  | 
extension (sc: StringContext)  | 
99  | 
def p(args: Any*) = StrParser(sc.s(args*))  | 
|
| 897 | 100  | 
|
101  | 
||
| 959 | 102  | 
(p"else").parse("elsethen")
 | 
| 799 | 103  | 
|
| 732 | 104  | 
// more convenient syntax for parser combinators  | 
| 940 | 105  | 
extension [I: IsSeq, T](p: Parser[I, T]) {
 | 
| 732 | 106  | 
def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q)  | 
107  | 
def ~[S] (q : => Parser[I, S]) = new SeqParser[I, T, S](p, q)  | 
|
| 919 | 108  | 
def map[S](f: => T => S) = new MapParser[I, T, S](p, f)  | 
| 732 | 109  | 
}  | 
110  | 
||
| 959 | 111  | 
// simple example of transforming the  | 
| 947 | 112  | 
// result into capital letters  | 
| 897 | 113  | 
def toU(s: String) = s.map(_.toUpper)  | 
114  | 
||
| 959 | 115  | 
(p"else").map(toU(_)).parse("elseifthen")
 | 
| 897 | 116  | 
|
| 732 | 117  | 
// these implicits allow us to use an infix notation for  | 
118  | 
// sequences and alternatives; we also can write the usual  | 
|
119  | 
// map for a MapParser  | 
|
120  | 
||
121  | 
||
122  | 
// with this NumParserInt can now be written more conveniently  | 
|
123  | 
// as:  | 
|
124  | 
||
| 799 | 125  | 
val NumParserInt2 = NumParser.map(_.toInt)  | 
| 732 | 126  | 
|
| 953 | 127  | 
val x = 1 + 3  | 
| 732 | 128  | 
|
129  | 
// A parser for palindromes (just returns them as string)  | 
|
| 947 | 130  | 
// since the parser is recursive it needs to be lazy  | 
| 803 | 131  | 
lazy val Pal : Parser[String, String] = {
 | 
| 959 | 132  | 
   (p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => s"$x$y$z" } ||
 | 
133  | 
   (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => s"$x$y$z" } ||
 | 
|
| 896 | 134  | 
p"a" || p"b" || p""  | 
| 959 | 135  | 
}  | 
| 732 | 136  | 
|
137  | 
// examples  | 
|
| 953 | 138  | 
Pal.parse_all("abacaba")
 | 
139  | 
Pal.parse("abacaaba")
 | 
|
| 732 | 140  | 
|
141  | 
println("Palindrome: " + Pal.parse_all("abaaaba"))
 | 
|
142  | 
||
| 959 | 143  | 
// A parser for wellnested parentheses  | 
| 799 | 144  | 
//  | 
145  | 
// P ::= ( P ) P | epsilon  | 
|
146  | 
//  | 
|
147  | 
//   (transforms '(' -> '{' , ')' -> '}' )
 | 
|
148  | 
lazy val P : Parser[String, String] = {
 | 
|
| 919 | 149  | 
  (p"(" ~ P ~ p")" ~ P).map{ case (((_, x), _), y) => "{" + x + "}" + y } ||
 | 
| 799 | 150  | 
p""  | 
| 959 | 151  | 
}  | 
| 732 | 152  | 
|
153  | 
println(P.parse_all("(((()()))())"))
 | 
|
154  | 
println(P.parse_all("(((()()))()))"))
 | 
|
155  | 
println(P.parse_all(")("))
 | 
|
156  | 
println(P.parse_all("()"))
 | 
|
157  | 
||
158  | 
// A parser for arithmetic expressions (Terms and Factors)  | 
|
| 898 | 159  | 
|
| 799 | 160  | 
lazy val E: Parser[String, Int] = {
 | 
| 732 | 161  | 
  (T ~ p"+" ~ E).map{ case ((x, _), z) => x + z } ||
 | 
| 799 | 162  | 
  (T ~ p"-" ~ E).map{ case ((x, _), z) => x - z } || T }
 | 
163  | 
lazy val T: Parser[String, Int] = {
 | 
|
164  | 
  (F ~ p"*" ~ T).map{ case ((x, _), z) => x * z } || F }
 | 
|
165  | 
lazy val F: Parser[String, Int] = {
 | 
|
166  | 
  (p"(" ~ E ~ p")").map{ case ((_, y), _) => y } || NumParserInt }
 | 
|
| 955 | 167  | 
|
168  | 
println(E.parse_all("2*2*2"))
 | 
|
| 732 | 169  | 
println(E.parse_all("1+3+4"))
 | 
170  | 
println(E.parse("1+3+4"))
 | 
|
171  | 
println(E.parse_all("4*2+3"))
 | 
|
172  | 
println(E.parse_all("4*(2+3)"))
 | 
|
| 953 | 173  | 
println(E.parse_all("(4)*(((2+3)))"))
 | 
| 732 | 174  | 
println(E.parse_all("4/2+3"))
 | 
175  | 
println(E.parse("1 + 2 * 3"))
 | 
|
176  | 
println(E.parse_all("(1+2)+3"))
 | 
|
| 799 | 177  | 
println(E.parse_all("1+2+3"))
 | 
| 732 | 178  | 
|
179  | 
||
| 960 | 180  | 
// with parser combinators (and many other parsing algorithms)  | 
181  | 
// no left-recursion is allowed, otherwise the will loop;  | 
|
182  | 
// newer versions of Scala (3.5) will actually give a warning  | 
|
183  | 
// about this  | 
|
| 732 | 184  | 
|
| 960 | 185  | 
/*  | 
| 959 | 186  | 
lazy val EL: Parser[String, Int] =  | 
187  | 
  ((EL ~ p"+" ~ EL).map{ case ((x, y), z) => x + z} ||
 | 
|
| 732 | 188  | 
   (EL ~ p"*" ~ EL).map{ case ((x, y), z) => x * z} ||
 | 
189  | 
   (p"(" ~ EL ~ p")").map{ case ((x, y), z) => y} ||
 | 
|
190  | 
NumParserInt)  | 
|
| 960 | 191  | 
*/  | 
| 732 | 192  | 
|
193  | 
// this will run forever:  | 
|
194  | 
//println(EL.parse_all("1+2+3"))
 | 
|
195  | 
||
196  | 
||
197  | 
// non-ambiguous vs ambiguous grammars  | 
|
198  | 
||
199  | 
// ambiguous  | 
|
200  | 
lazy val S : Parser[String, String] =  | 
|
201  | 
  (p"1" ~ S ~ S).map{ case ((x, y), z) => x + y + z } || p""
 | 
|
202  | 
||
| 850 | 203  | 
//println(time(S.parse("1" * 10)))
 | 
204  | 
//println(time(S.parse_all("1" * 10)))
 | 
|
| 732 | 205  | 
|
206  | 
// non-ambiguous  | 
|
207  | 
lazy val U : Parser[String, String] =  | 
|
208  | 
  (p"1" ~ U).map{ case (x, y) => x + y } || p""
 | 
|
209  | 
||
| 850 | 210  | 
//println(time(U.parse("1" * 10)))
 | 
211  | 
//println(time(U.parse_all("1" * 10)))
 | 
|
| 732 | 212  | 
println(U.parse("1" * 25))
 | 
213  | 
||
214  | 
U.parse("11")
 | 
|
215  | 
U.parse("11111")
 | 
|
216  | 
U.parse("11011")
 | 
|
217  | 
||
218  | 
U.parse_all("1" * 100)
 | 
|
219  | 
U.parse_all("1" * 100 + "0")
 | 
|
220  | 
||
221  | 
// you can see the difference in second example  | 
|
222  | 
//S.parse_all("1" * 100)         // succeeds
 | 
|
223  | 
//S.parse_all("1" * 100 + "0")   // fails
 | 
|
224  | 
||
225  | 
||
226  | 
// A variant which counts how many 1s are parsed  | 
|
227  | 
lazy val UCount : Parser[String, Int] =  | 
|
| 799 | 228  | 
  (p"1" ~ UCount).map{ case (_, y) => y + 1 } || p"".map{ _ => 0 }
 | 
| 732 | 229  | 
|
230  | 
println(UCount.parse("11111"))
 | 
|
231  | 
println(UCount.parse_all("11111"))
 | 
|
232  | 
||
233  | 
// Two single character parsers  | 
|
234  | 
lazy val One : Parser[String, String] = p"a"  | 
|
235  | 
lazy val Two : Parser[String, String] = p"b"  | 
|
236  | 
||
237  | 
One.parse("a")
 | 
|
238  | 
One.parse("aaa")
 | 
|
239  | 
||
240  | 
// note how the pairs nest to the left with sequence parsers  | 
|
241  | 
(One ~ One).parse("aaa")
 | 
|
242  | 
(One ~ One ~ One).parse("aaa")
 | 
|
243  | 
(One ~ One ~ One ~ One).parse("aaaa")
 | 
|
244  | 
||
245  | 
(One || Two).parse("aaa")
 | 
|
246  | 
||
247  | 
||
248  | 
||
| 959 | 249  | 
// a problem with the arithmetic expression parser: it  | 
| 742 | 250  | 
// gets very slow with deeply nested parentheses  | 
| 732 | 251  | 
|
| 919 | 252  | 
println("A runtime problem")
 | 
| 732 | 253  | 
println(E.parse("1"))
 | 
254  | 
println(E.parse("(1)"))
 | 
|
255  | 
println(E.parse("((1))"))
 | 
|
| 906 | 256  | 
println(E.parse("(((1)))"))
 | 
257  | 
println(E.parse("((((1))))"))
 | 
|
| 919 | 258  | 
println(E.parse("((((((1))))))"))
 | 
259  | 
println(E.parse("(((((((1)))))))"))
 | 
|
| 906 | 260  | 
//println(E.parse("((((((((1))))))))"))
 | 
| 828 | 261  | 
|
262  | 
||
| 919 | 263  | 
// faster because of merge in the +/- case  | 
| 906 | 264  | 
lazy val E2: Parser[String, Int] = {
 | 
| 919 | 265  | 
  (T2 ~ (p"+" || p"-") ~ E2).map[Int]{ case ((x, y), z) => if (y == "+") x + z else x - z} || T2 }
 | 
| 906 | 266  | 
lazy val T2: Parser[String, Int] = {
 | 
| 919 | 267  | 
  (F2 ~ p"*" ~ T2).map[Int]{ case ((x, _), z) => x * z } || F2 }
 | 
| 906 | 268  | 
lazy val F2: Parser[String, Int] = {
 | 
| 919 | 269  | 
  (p"(" ~ E2 ~ p")").map[Int]{ case ((_, y), _) => y } || NumParserInt }
 | 
| 828 | 270  | 
|
271  | 
||
| 919 | 272  | 
println("mitigated by merging clauses")
 | 
| 906 | 273  | 
println(E2.parse("1"))
 | 
274  | 
println(E2.parse("(1)"))
 | 
|
275  | 
println(E2.parse("((1))"))
 | 
|
276  | 
println(E2.parse("(((1)))"))
 | 
|
277  | 
println(E2.parse("((((1))))"))
 | 
|
| 919 | 278  | 
println(E2.parse("((((((1))))))"))
 | 
279  | 
println(E2.parse("(((((((1)))))))"))
 | 
|
| 936 | 280  | 
println(E2.parse("((((((((1))))))))"))
 | 
| 
958
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
281  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
282  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
283  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
284  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
285  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
286  | 
/*  | 
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
287  | 
Try  | 
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
288  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
289  | 
6 / 2 * (2+1)  | 
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
290  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
955 
diff
changeset
 | 
291  | 
*/  |