progs/parser-combinators/comb1-2.sc
changeset 906 2bf1516d730f
parent 898 45a48c47dcca
equal deleted inserted replaced
905:15973df32613 906:2bf1516d730f
       
     1 // Parser Combinators: Simple Version
       
     2 //====================================
       
     3 //
       
     4 // Call with Ammonite (Scala 2.13.10)
       
     5 //
       
     6 //  amm comb1-2.sc
       
     7 
       
     8  
       
     9 //  Note, in the lectures I did not show the implicit type bound
       
    10 //  I : IsSeq, which means that the input type 'I' needs to be
       
    11 //  a sequence. 
       
    12 
       
    13 type IsSeq[A] = A => Seq[_]
       
    14 
       
    15 abstract class Parser[I : IsSeq, T]{
       
    16   def parse(in: I): Set[(T, I)]  
       
    17 
       
    18   def parse_all(in: I) : Set[T] =
       
    19     for ((hd, tl) <- parse(in); 
       
    20         if tl.isEmpty) yield hd
       
    21 }
       
    22 
       
    23 // parser combinators
       
    24 
       
    25 // alternative parser
       
    26 class AltParser[I : IsSeq, T](p: => Parser[I, T], 
       
    27                               q: => Parser[I, T]) extends Parser[I, T] {
       
    28   def parse(in: I) = p.parse(in) ++ q.parse(in)   
       
    29 }
       
    30 
       
    31 // sequence parser
       
    32 class SeqParser[I : IsSeq, T, S](p: => Parser[I, T], 
       
    33                                  q: => Parser[I, S]) extends Parser[I, (T, S)] {
       
    34   def parse(in: I) = 
       
    35     for ((hd1, tl1) <- p.parse(in); 
       
    36          (hd2, tl2) <- q.parse(tl1)) yield ((hd1, hd2), tl2)
       
    37 }
       
    38 
       
    39 // map parser
       
    40 class MapParser[I : IsSeq, T, S](p: => Parser[I, T], 
       
    41                                  f: T => S) extends Parser[I, S] {
       
    42   def parse(in: I) = for ((hd, tl) <- p.parse(in)) yield (f(hd), tl)
       
    43 }
       
    44 
       
    45 
       
    46 
       
    47 // an example of an atomic parser for characters
       
    48 case class CharParser(c: Char) extends Parser[String, Char] {
       
    49   def parse(in: String) = 
       
    50     if (in != "" && in.head == c) Set((c, in.tail)) else Set()
       
    51 }
       
    52 
       
    53 CharParser('c').parse("abc")
       
    54 
       
    55 // an atomic parser for parsing strings according to a regex
       
    56 import scala.util.matching.Regex
       
    57 
       
    58 case class RegexParser(reg: Regex) extends Parser[String, String] {
       
    59   def parse(in: String) = reg.findPrefixMatchOf(in) match {
       
    60     case None => Set()
       
    61     case Some(m) => Set((m.matched, m.after.toString))  
       
    62   }
       
    63 }
       
    64 
       
    65 // atomic parsers for numbers and "verbatim" strings 
       
    66 val NumParser = RegexParser("[0-9]+".r)
       
    67 def StrParser(s: String) = RegexParser(Regex.quote(s).r)
       
    68 
       
    69 NumParser.parse("a123a123bc") 
       
    70 StrParser("else").parse("eelsethen")
       
    71 
       
    72 
       
    73 // NumParserInt transforms a "string integer" into a proper Int
       
    74 // (needs "new" because MapParser is not a case class)
       
    75 
       
    76 val NumParserInt = new MapParser(NumParser, (s: String) => s.toInt)
       
    77 NumParserInt.parse("123abc")
       
    78 
       
    79 // the following string interpolation allows us to write 
       
    80 // StrParser(_some_string_) more conveniently as 
       
    81 //
       
    82 // p"<_some_string_>" 
       
    83 
       
    84 
       
    85 implicit def parser_interpolation(sc: StringContext) = new {
       
    86   def p(args: Any*) = StrParser(sc.s(args:_*))
       
    87 }
       
    88 
       
    89 (p"else").parse("elsethen")           
       
    90 
       
    91 // more convenient syntax for parser combinators
       
    92 
       
    93 implicit def ParserOps[I : IsSeq, T](p: Parser[I, T]) = new {
       
    94   def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q)
       
    95   def ~[S] (q : => Parser[I, S]) = new SeqParser[I, T, S](p, q)
       
    96   def map[S](f: => T => S) = new MapParser[I, T, S](p, f)
       
    97 }
       
    98 
       
    99 // example
       
   100 def toU(s: String) = s.map(_.toUpper)
       
   101 (p"ELSE").map(toU(_)).parse("ELSEifthen")  
       
   102 
       
   103 // these implicits allow us to use an infix notation for
       
   104 // sequences and alternatives; we also can write the usual
       
   105 // map for a MapParser
       
   106 
       
   107 
       
   108 // with this NumParserInt can now be written more conveniently
       
   109 // as:
       
   110 
       
   111 val NumParserInt2 = NumParser.map(_.toInt)
       
   112 
       
   113 
       
   114 
       
   115 // A parser for palindromes (just returns them as string)
       
   116 lazy val Pal : Parser[String, String] = {
       
   117    (p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => s"$x$y$z" } || 
       
   118    (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => s"$x$y$z" } || 
       
   119     p"a" || p"b" || p""
       
   120 }  
       
   121 
       
   122 // examples
       
   123 Pal.parse_all("abaaaba")
       
   124 Pal.parse("abaaaba")
       
   125 
       
   126 println("Palindrome: " + Pal.parse_all("abaaaba"))
       
   127 
       
   128 // A parser for wellnested parentheses 
       
   129 //
       
   130 //   P ::= ( P ) P | epsilon
       
   131 //
       
   132 //   (transforms '(' -> '{' , ')' -> '}' )
       
   133 lazy val P : Parser[String, String] = {
       
   134   (p"(" ~ P ~ p")" ~ P).map{ case (((_, x), _), y) => "{" + x + "}" + y } ||
       
   135   p""
       
   136 }  
       
   137 
       
   138 println(P.parse_all("(((()()))())"))
       
   139 println(P.parse_all("(((()()))()))"))
       
   140 println(P.parse_all(")("))
       
   141 println(P.parse_all("()"))
       
   142 
       
   143 // A parser for arithmetic expressions (Terms and Factors)
       
   144 
       
   145 lazy val E: Parser[String, Int] = {
       
   146   (T ~ p"+" ~ E).map{ case ((x, _), z) => x + z } ||
       
   147   (T ~ p"-" ~ E).map{ case ((x, _), z) => x - z } || T }
       
   148 lazy val T: Parser[String, Int] = {
       
   149   (F ~ p"*" ~ T).map{ case ((x, _), z) => x * z } || F }
       
   150 lazy val F: Parser[String, Int] = {
       
   151   (p"(" ~ E ~ p")").map{ case ((_, y), _) => y } || NumParserInt }
       
   152 
       
   153 println(E.parse_all("1+3+4"))
       
   154 println(E.parse("1+3+4"))
       
   155 println(E.parse_all("4*2+3"))
       
   156 println(E.parse_all("4*(2+3)"))
       
   157 println(E.parse_all("(4)*((2+3))"))
       
   158 println(E.parse_all("4/2+3"))
       
   159 println(E.parse("1 + 2 * 3"))
       
   160 println(E.parse_all("(1+2)+3"))
       
   161 println(E.parse_all("1+2+3"))
       
   162 
       
   163 
       
   164 // with parser combinators (and other parsing algorithms)
       
   165 // no left-recursion is allowed, otherwise they will loop
       
   166 
       
   167 lazy val EL: Parser[String, Int] = 
       
   168   ((EL ~ p"+" ~ EL).map{ case ((x, y), z) => x + z} || 
       
   169    (EL ~ p"*" ~ EL).map{ case ((x, y), z) => x * z} ||
       
   170    (p"(" ~ EL ~ p")").map{ case ((x, y), z) => y} ||
       
   171    NumParserInt)
       
   172 
       
   173 // this will run forever:
       
   174 //println(EL.parse_all("1+2+3"))
       
   175 
       
   176 
       
   177 // non-ambiguous vs ambiguous grammars
       
   178 
       
   179 // ambiguous
       
   180 lazy val S : Parser[String, String] =
       
   181   (p"1" ~ S ~ S).map{ case ((x, y), z) => x + y + z } || p""
       
   182 
       
   183 //println(time(S.parse("1" * 10)))
       
   184 //println(time(S.parse_all("1" * 10)))
       
   185 
       
   186 // non-ambiguous
       
   187 lazy val U : Parser[String, String] =
       
   188   (p"1" ~ U).map{ case (x, y) => x + y } || p""
       
   189 
       
   190 //println(time(U.parse("1" * 10)))
       
   191 //println(time(U.parse_all("1" * 10)))
       
   192 println(U.parse("1" * 25))
       
   193 
       
   194 U.parse("11")
       
   195 U.parse("11111")
       
   196 U.parse("11011")
       
   197 
       
   198 U.parse_all("1" * 100)
       
   199 U.parse_all("1" * 100 + "0")
       
   200 
       
   201 // you can see the difference in second example
       
   202 //S.parse_all("1" * 100)         // succeeds
       
   203 //S.parse_all("1" * 100 + "0")   // fails
       
   204 
       
   205 
       
   206 // A variant which counts how many 1s are parsed
       
   207 lazy val UCount : Parser[String, Int] =
       
   208   (p"1" ~ UCount).map{ case (_, y) => y + 1 } || p"".map{ _ => 0 }
       
   209 
       
   210 println(UCount.parse("11111"))
       
   211 println(UCount.parse_all("11111"))
       
   212 
       
   213 // Two single character parsers
       
   214 lazy val One : Parser[String, String] = p"a"
       
   215 lazy val Two : Parser[String, String] = p"b"
       
   216 
       
   217 One.parse("a")
       
   218 One.parse("aaa")
       
   219 
       
   220 // note how the pairs nest to the left with sequence parsers
       
   221 (One ~ One).parse("aaa")
       
   222 (One ~ One ~ One).parse("aaa")
       
   223 (One ~ One ~ One ~ One).parse("aaaa")
       
   224 
       
   225 (One || Two).parse("aaa")
       
   226 
       
   227 
       
   228 
       
   229 // a problem with the arithmetic expression parser: it 
       
   230 // gets very slow with deeply nested parentheses
       
   231 
       
   232 println("Runtime problem")
       
   233 println(E.parse("1"))
       
   234 println(E.parse("(1)"))
       
   235 println(E.parse("((1))"))
       
   236 //println(E.parse("(((1)))"))
       
   237 //println(E.parse("((((1))))"))
       
   238 //println(E.parse("((((((1))))))"))
       
   239 //println(E.parse("(((((((1)))))))"))
       
   240 //println(E.parse("((((((((1)))))))"))
       
   241