updated
author    Christian Urban <christian dot urban at kcl dot ac dot uk>
date      Fri, 23 Nov 2012 14:08:31 +0000
changeset 71 7717f20f0504
parent 70 e6868bd2942b
child 72 d65525aeca08
html.scala
parser.scala
parser1.scala
parser2.scala
parser2a.scala
parser4.scala
test.html
while.scala
--- a/html.scala	Wed Nov 21 09:04:11 2012 +0000
+++ b/html.scala	Fri Nov 23 14:08:31 2012 +0000
@@ -56,4 +56,44 @@
   case _::rest => interpret(rest, c, ctr)
 }
  
-interpret(T.fromFile("test.html"), 0, Nil)
+val test_string = """
+<b>MSc Projects</b>
+
+<p>
+start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.
+</p> 
+
+
+ <p><b>Description:</b>  
+  <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,
+  lexing programs, syntax highlighting and so on. Given that regular expressions were
+  introduced in 1950 by <a>Stephen Kleene</a>, you might think 
+  regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still
+  an active research area. For example
+  <a>this paper</a> 
+  about regular expression matching and partial derivatives was presented this summer at the international 
+  PPDP'12 conference. The task in this project is to implement the results from this paper.</p>
+
+  <p>The background for this project is that some regular expressions are 
+  <a>evil</a>
+  and can stab you in the back, according to
+  this <a>blog post</a>.
+  For example, if you use in <a>Python</a> or 
+  in <a>Ruby</a> (probably also in other mainstream programming languages) the 
+  innocent-looking regular expression a?{28}a{28} and match it, say, against the string 
+  <red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is, 28 a's), you will soon notice that your CPU usage goes to 100%. In fact,
+  Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:
+  <a>re.py</a> (Python version) and 
+  <a>re.rb</a> 
+  (Ruby version). You can imagine an attacker
+  mounting a nice <a>DoS attack</a> against 
+  your program if it contains such an evil regular expression. Actually 
+  <a>Scala</a> (and also Java) are almost immune to such
+  attacks as they can deal with strings of up to 4,300 a's in less than a second. But if you scale
+  the regular expression and string further to, say, 4,600 a's, then you get a 
+  StackOverflowError 
+  potentially crashing your program.
+  </p>
+"""
+
+interpret(T.fromString(test_string), 0, Nil)
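
The project description above makes a concrete performance claim about the regular
expression a?{28}a{28}. The lines below are not part of the changeset; they are a
minimal, self-contained Scala sketch of how that claim can be tried out with the
JVM's built-in backtracking matcher (the grouping (a?){n} is my reading of the
a?{n} written in the text, and the helper evilMatch is made up for illustration):

// sketch only: time how long the built-in matcher needs for the "evil"
// regular expression (a?){n}a{n} matched against a string of n a's
def evilMatch(n: Int): (Boolean, Double) = {
  val re    = "(a?){" + n + "}a{" + n + "}"
  val start = System.nanoTime()
  val res   = ("a" * n).matches(re)        // String.matches uses java.util.regex
  (res, (System.nanoTime() - start) / 1e9)
}

evilMatch(28)   // harmless here; as claimed above, only strings of several
                // thousand a's make the JVM matcher slow or overflow the stack
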
--- a/parser.scala	Wed Nov 21 09:04:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-:load matcher.scala
-
-// some regular expressions
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
-
-val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
-val LPAREN = CHAR('(')
-val RPAREN = CHAR(')')
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
-
-// for classifying the strings that have been recognised
-
-abstract class Token
-case object T_WHITESPACE extends Token
-case object T_NUM extends Token
-case class T_OP(s: String) extends Token
-case object T_LPAREN extends Token
-case object T_RPAREN extends Token
-case class NT(s: String) extends Token
-
-
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] = 
-  tokenize(rs, s.toList).filterNot(_ match {
-    case T_WHITESPACE => true
-    case _ => false
-  })
-
-// lexing rules for arithmetic expressions
-val lexing_rules: List[Rule[Token]]= 
-  List((NUMBER, (s) => T_NUM),
-       (WHITESPACE, (s) => T_WHITESPACE),
-       (LPAREN, (s) => T_LPAREN),
-       (RPAREN, (s) => T_RPAREN),
-       (OPS, (s) => T_OP(s.mkString)))
-
-
-type Grammar = List[(String, List[Token])]
-
-// grammar for arithmetic expressions
-val grammar = 
-  List ("F" -> List(T_NUM),
-        "E" -> List(T_NUM),
-        "E" -> List(NT("E"), T_OP("+"), NT("E")),
-        "E" -> List(NT("E"), T_OP("-"), NT("E")),
-        "E" -> List(NT("E"), T_OP("*"), NT("E")),    
-        "E" -> List(T_LPAREN, NT("E"), T_RPAREN))
-
-
-def chop[A](ts1: List[A], prefix: List[A], ts2: List[A]) : Option[(List[A], List[A])] = 
-  ts1 match {
-    case Nil => None
-    case t::ts => 
-      if (ts1.startsWith(prefix)) Some(ts2.reverse, ts1.drop(prefix.length))
-      else chop(ts, prefix, t::ts2)
-  }
-
-// examples
-chop(List(1,2,3,4,5,6,7,8,9), List(4,5), Nil)  
-chop(List(1,2,3,4,5,6,7,8,9), List(3,5), Nil)  
-
-def replace[A](ts: List[A], out: List[A], in: List [A]) = 
-  chop(ts, out, Nil) match {
-    case None => None
-    case Some((before, after)) => Some(before ::: in ::: after)
-  }  
-
-def parse(g: Grammar, ts: List[Token]) : Boolean = {
-  println(ts)
-  if (ts == List(NT("E"))) true
-  else {
-    val tss = for ((lhs, rhs) <- g) yield replace(ts, rhs, List(NT(lhs)))
-    tss.flatten.exists(parse(g, _))
-  }
-}
- 
-def parser(g: Grammar, rs: List[Rule[Token]], s: String) = {
-  println("\n")
-  parse(g, tokenizer(rs, s))
-}
-  
-
-
-parser(grammar, lexing_rules, "2 + 3 *    4 +       1")
-parser(grammar, lexing_rules, "(2 + 3) * (4 + 1)")
-parser(grammar, lexing_rules, "(2 + 3) * 4 (4 + 1)")
-
-
- 
--- a/parser1.scala	Wed Nov 21 09:04:11 2012 +0000
+++ b/parser1.scala	Fri Nov 23 14:08:31 2012 +0000
@@ -1,31 +1,27 @@
-:load matcher.scala
+// A naive bottom-up parser with backtracking
+//
+// Needs:
+//   :load matcher.scala
 
 // some regular expressions
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
+val DIGIT = RANGE("0123456789")
+val NONZERODIGIT = RANGE("123456789")
 
 val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
 val LPAREN = CHAR('(')
 val RPAREN = CHAR(')')
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
+val WHITESPACE = PLUS(RANGE(" \n"))
+val OPS = RANGE("+-*")
 
 // for classifying the strings that have been recognised
+
 abstract class Token
 case object T_WHITESPACE extends Token
 case object T_NUM extends Token
 case class T_OP(s: String) extends Token
 case object T_LPAREN extends Token
 case object T_RPAREN extends Token
-case class T_NT(s: String, rhs: List[Token]) extends Token
-
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] = 
-  tokenize(rs, s.toList).filterNot(_ match {
-    case T_WHITESPACE => true
-    case _ => false
-  })
-
-
+case class NT(s: String) extends Token
 
 // lexing rules for arithmetic expressions
 val lexing_rules: List[Rule[Token]]= 
@@ -35,33 +31,30 @@
        (RPAREN, (s) => T_RPAREN),
        (OPS, (s) => T_OP(s.mkString)))
 
+// the tokenizer
+val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
 
 type Grammar = List[(String, List[Token])]
 
 // grammar for arithmetic expressions
 val grammar = 
-  List ("E" -> List(T_NUM),
-        "E" -> List(T_NT("E", Nil), T_OP("+"), T_NT("E", Nil)),
-        "E" -> List(T_NT("E", Nil), T_OP("-"), T_NT("E", Nil)),
-        "E" -> List(T_NT("E", Nil), T_OP("*"), T_NT("E", Nil)),    
-        "E" -> List(T_LPAREN, T_NT("E", Nil), T_RPAREN))
+  List ("F" -> List(T_NUM),
+        "E" -> List(T_NUM),
+        "E" -> List(NT("E"), T_OP("+"), NT("E")),
+        "E" -> List(NT("E"), T_OP("-"), NT("E")),
+        "E" -> List(NT("E"), T_OP("*"), NT("E")),    
+        "E" -> List(T_LPAREN, NT("E"), T_RPAREN))
 
-def startsWith[A](ts1: List[A], ts2: List[A]) : Boolean = (ts1, ts2) match {
-  case (_, Nil) => true
-  case (T_NT(e, _)::ts1,T_NT(f, _)::ts2) => (e == f) && startsWith(ts1, ts2)
-  case (t1::ts1, t2::ts2) => (t1 == t2) && startsWith(ts1, ts2)
-  case _ => false
-}
 
 def chop[A](ts1: List[A], prefix: List[A], ts2: List[A]) : Option[(List[A], List[A])] = 
   ts1 match {
     case Nil => None
     case t::ts => 
-      if (startsWith(ts1, prefix)) Some(ts2.reverse, ts1.drop(prefix.length))
+      if (ts1.startsWith(prefix)) Some(ts2.reverse, ts1.drop(prefix.length))
       else chop(ts, prefix, t::ts2)
   }
 
-// examples
+// examples for chop 
 chop(List(1,2,3,4,5,6,7,8,9), List(4,5), Nil)  
 chop(List(1,2,3,4,5,6,7,8,9), List(3,5), Nil)  
 
@@ -71,18 +64,25 @@
     case Some((before, after)) => Some(before ::: in ::: after)
   }  
 
-def parse1(g: Grammar, ts: List[Token]) : Boolean = ts match {
-  case List(T_NT("E", tree)) => { println(tree); true }
-  case _ => {
-    val tss = for ((lhs, rhs) <- g) yield replace(ts, rhs, List(T_NT(lhs, rhs)))
-    tss.flatten.exists(parse1(g, _))
+def parse(g: Grammar, ts: List[Token]) : Boolean = {
+  println(ts)
+  if (ts == List(NT("E"))) true
+  else {
+    val tss = for ((lhs, rhs) <- g) yield replace(ts, rhs, List(NT(lhs)))
+    tss.flatten.exists(parse(g, _))
   }
 }
  
+def parser(g: Grammar, s: String) = {
+  println("\n")
+  parse(g, Tok.fromString(s))
+}
+  
 
-println() ; parse1(grammar, tokenizer(lexing_rules, "2 + 3 * 4 + 1"))
-println() ; parse1(grammar, tokenizer(lexing_rules, "(2 + 3) * (4 + 1)"))
-println() ; parse1(grammar, tokenizer(lexing_rules, "(2 + 3) * 4 (4 + 1)"))
+
+parser(grammar, "2 + 3 *    4 +       1")
+parser(grammar, "(2 + 3) * (4 + 1)")
+parser(grammar, "(2 + 3) * 4 (4 + 1)")
 
 
  
--- a/parser2.scala	Wed Nov 21 09:04:11 2012 +0000
+++ b/parser2.scala	Fri Nov 23 14:08:31 2012 +0000
@@ -1,18 +1,21 @@
-:load matcher.scala
+// A naive version of parser combinators producing parse trees
+//
+// Needs
+//   :load matcher.scala
 
 // some regular expressions
-val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
+val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz")
 val ID = PLUS(LETTER)
 
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
+val DIGIT = RANGE("0123456789")
+val NONZERODIGIT = RANGE("123456789")
 val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
 
 val LPAREN = CHAR('(')
 val RPAREN = CHAR(')')
 
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
+val WHITESPACE = PLUS(RANGE(" \n"))
+val OPS = RANGE("+-*")
 
 // for classifying the strings that have been recognised
 abstract class Token
@@ -27,13 +30,6 @@
 case object T_THEN extends Token
 case object T_ELSE extends Token
 
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] = 
-  tokenize(rs, s.toList).filterNot(_ match {
-    case T_WHITESPACE => true
-    case _ => false
-  })
-
-
 // lexing rules for arithmetic expressions
 val lexing_rules: List[Rule[Token]]= 
   List(("if", (s) => T_IF),
@@ -46,6 +42,8 @@
        (RPAREN, (s) => T_RPAREN),
        (OPS, (s) => T_OP(s.mkString)))
 
+val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
+
 
 // parse trees
 abstract class ParseTree
@@ -115,8 +113,8 @@
 lazy val T: Parser = (F ~ T_OP("*") ~ T) || F
 lazy val F: Parser = (T_LPAREN ~ E ~ T_RPAREN) || NumParser
    
-tokenizer(lexing_rules, "1 + 2 + 3")
-println(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3")))
+println(Tok.fromString("1 + 2 + 3"))
+println(E.parse_all(Tok.fromString("1 + 2 + 3")))
 
 def eval(t: ParseTree) : Int = t match {
   case Leaf(T_NUM(n)) => n.toInt
@@ -125,17 +123,16 @@
   case Branch(List(Leaf(T_LPAREN), t, Leaf(T_RPAREN))) => eval(t) 
 }
 
-(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3"))).map(eval(_))
-(E.parse_all(tokenizer(lexing_rules, "1 + 2 * 3"))).map(eval(_))
-(E.parse_all(tokenizer(lexing_rules, "(1 + 2) * 3"))).map(eval(_))
+(E.parse_all(Tok.fromString("1 + 2 + 3"))).map(eval(_))
+(E.parse_all(Tok.fromString("1 + 2 * 3"))).map(eval(_))
 
 lazy val EXPR: Parser = 
   new ListParser(List(T_IF, EXPR, T_THEN, EXPR)) || 
   new ListParser(List(T_IF, EXPR, T_THEN, EXPR, T_ELSE, EXPR)) || 
   IdParser
  
-println(EXPR.parse_all(tokenizer(lexing_rules, "if a then b else c")))
-println(EXPR.parse_all(tokenizer(lexing_rules, "if a then if x then y else c")))
+println(EXPR.parse_all(Tok.fromString("if a then b else c")))
+println(EXPR.parse_all(Tok.fromString("if a then if x then y else c")))
 
 
 
--- a/parser2a.scala	Wed Nov 21 09:04:11 2012 +0000
+++ b/parser2a.scala	Fri Nov 23 14:08:31 2012 +0000
@@ -1,18 +1,22 @@
-:load matcher.scala
+// Parser combinators including semantic actions
+// parses lists of tokens
+//
+// Needs
+//    :load matcher.scala
 
 // some regular expressions
-val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
+val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz")
 val ID = PLUS(LETTER)
 
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
+val DIGIT = RANGE("0123456789")
+val NONZERODIGIT = RANGE("123456789")
 val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
 
 val LPAREN = CHAR('(')
 val RPAREN = CHAR(')')
 
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
+val WHITESPACE = PLUS(RANGE(" \n"))
+val OPS = RANGE("+-*")
 
 // for classifying the strings that have been recognised
 abstract class Token
@@ -27,13 +31,6 @@
 case object T_THEN extends Token
 case object T_ELSE extends Token
 
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] = 
-  tokenize(rs, s.toList).filterNot(_ match {
-    case T_WHITESPACE => true
-    case _ => false
-  })
-
-
 // lexing rules for arithmetic expressions
 val lexing_rules: List[Rule[Token]]= 
   List(("if", (s) => T_IF),
@@ -46,6 +43,7 @@
        (RPAREN, (s) => T_RPAREN),
        (OPS, (s) => T_OP(s.mkString)))
 
+val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
 
 // parser combinators with return type T
 abstract class Parser[T] {
@@ -98,9 +96,10 @@
 lazy val T: Parser[Int] = (F ~ T_OP("*") ~ T) ==> { case ((x, y), z) => x * z } || F
 lazy val F: Parser[Int] = (T_LPAREN ~> E <~ T_RPAREN) || NumParser
    
-println(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3")))
-println(E.parse_all(tokenizer(lexing_rules, "1 + 2 * 3")))
-println(E.parse_all(tokenizer(lexing_rules, "(1 + 2) * 3")))
+println(E.parse_all(Tok.fromString("1 + 2 + 3")))
+println(E.parse_all(Tok.fromString("1 + 2 * 3")))
+println(E.parse_all(Tok.fromString("(1 + 2) * 3")))
 
-println(E.parse_all(tokenizer(lexing_rules, "(1 - 2) * 3")))
-println(E.parse_all(tokenizer(lexing_rules, "(1 + 2) * - 3")))
+// Exercise: implement minus 
+println(E.parse_all(Tok.fromString("(1 - 2) * 3")))
+println(E.parse_all(Tok.fromString("(1 + 2) * - 3")))
--- a/parser4.scala	Wed Nov 21 09:04:11 2012 +0000
+++ b/parser4.scala	Fri Nov 23 14:08:31 2012 +0000
@@ -77,6 +77,6 @@
                  "2" ==> { (s) => 2 } ||
                  "3" ==> { (s) => 3 })
 
-println(E.parse_all("1+2*3"))
+println(E.parse_all("1+2*3+3"))
 
 
--- a/test.html	Wed Nov 21 09:04:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-<b>MSc Projects</b>
-
-<p>
-start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.
-</p> 
-
-
- <p><b>Description:</b>  
-  <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,
-  lexing programs, syntax highlighting and so on. Given that regular expressions were
-  introduced in 1950 by <a>Stephen Kleene</a>, you might think 
-  regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still
-  an active research area. For example
-  <a>this paper</a> 
-  about regular expression matching and partial derivatives was presented this summer at the international 
-  PPDP'12 conference. The task in this project is to implement the results from this paper.</p>
-
-  <p>The background for this project is that some regular expressions are 
-  <a>evil</a>
-  and can stab you in the back; according to
-  this <a>blog post</a>.
-  For example, if you use in <a>Python</a> or 
-  in <a>Ruby</a> (probably also in other mainstream programming languages) the 
-  innocently looking regular expression a?{28}a{28} and match it, say, against the string 
-  <red>aaaaaaaaaaaaaaaaaaaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,
-  Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:
-  <a>re.py</a> (Python version) and 
-  <a>re.rb</a> 
-  (Ruby version). You can imagine an attacker
-  mounting a nice <a>DoS attack</a> against 
-  your program if it contains such an evil regular expression. Actually 
-  <a>Scala</a> (and also Java) are almost immune from such
-  attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale
-  the regular expression and string further to, say, 4,600 as, then you get a 
-  StackOverflowError 
-  potentially crashing your program.
-  </p>
--- a/while.scala	Wed Nov 21 09:04:11 2012 +0000
+++ b/while.scala	Fri Nov 23 14:08:31 2012 +0000
@@ -105,8 +105,9 @@
   (T_KWD("skip") ==> ((_) => Skip: Stmt)) ||
   (IdParser ~ T_OP(":=") ~ AExp) ==> { case ((x, y), z) => Assign(x, z): Stmt } ||
   (T_KWD("if") ~ BExp ~ T_KWD("then") ~ Block ~ T_KWD("else") ~ Block) ==>
-    { case (((((x,y),z),u),v),w) => If(y, u, w): Stmt }
-
+    { case (((((x,y),z),u),v),w) => If(y, u, w): Stmt } ||
+  (T_KWD("while") ~ BExp ~ T_KWD("do") ~ Block) ==> { case (((x, y), z), w) => While(y, w) } 
+ 
 lazy val Stmts: Parser[List[Token], Block] =
   (Stmt ~ T_SEMI ~ Stmts) ==> { case ((x, y), z) => x :: z : Block } ||
   (Stmt ==> ((s) => List(s) : Block))
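
A note on the new while-clause above (not part of the changeset): the ~ combinator
is left-associative, so

  T_KWD("while") ~ BExp ~ T_KWD("do") ~ Block

produces nested pairs of the form (((while-token, condition), do-token), body),
which is why the semantic action is written

  { case (((x, y), z), w) => While(y, w) }

with y the loop condition and w the loop body.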