--- a/html.scala Wed Nov 21 09:04:11 2012 +0000
+++ b/html.scala Fri Nov 23 14:08:31 2012 +0000
@@ -56,4 +56,44 @@
case _::rest => interpret(rest, c, ctr)
}
-interpret(T.fromFile("test.html"), 0, Nil)
+val test_string = """
+<b>MSc Projects</b>
+
+<p>
+start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.
+</p>
+
+
+ <p><b>Description:</b>
+ <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,
+ lexing programs, syntax highlighting and so on. Given that regular expressions were
+ introduced in 1950 by <a>Stephen Kleene</a>, you might think
+ regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still
+ an active research area. For example
+ <a>this paper</a>
+ about regular expression matching and partial derivatives was presented this summer at the international
+ PPDP'12 conference. The task in this project is to implement the results from this paper.</p>
+
+ <p>The background for this project is that some regular expressions are
+ <a>evil</a>
+ and can stab you in the back, according to
+ this <a>blog post</a>.
+ For example, if you use in <a>Python</a> or
+ in <a>Ruby</a> (probably also in other mainstream programming languages) the
+ innocent-looking regular expression a?{28}a{28} and match it, say, against the string
+ <red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,
+ Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:
+ <a>re.py</a> (Python version) and
+ <a>re.rb</a>
+ (Ruby version). You can imagine an attacker
+ mounting a nice <a>DoS attack</a> against
+ your program if it contains such an evil regular expression. Actually
+ <a>Scala</a> (and also Java) are almost immune to such
+ attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale
+ the regular expression and string further to, say, 4,600 as, then you get a
+ StackOverflowError
+ potentially crashing your program.
+ </p>
+"""
+
+interpret(T.fromString(test_string), 0, Nil)
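+
+// The text above mentions the "evil" regular expression a?{28}a{28}. The commented
+// lines below are only a sketch of how this claim could be tried out in Scala (they
+// are not part of the HTML interpreter): the pattern is written out explicitly and
+// matched with Java's regex engine via String.matches; the sizes 4300/4600 are the
+// ones quoted in the text.
+//
+//   def evil(n: Int) : String = ("a?" * n) + ("a" * n)
+//   ("a" * 4300).matches(evil(4300))   // according to the text: answered in under a second
+//   ("a" * 4600).matches(evil(4600))   // according to the text: may end in a StackOverflowError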
--- a/parser.scala Wed Nov 21 09:04:11 2012 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-:load matcher.scala
-
-// some regular expressions
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
-
-val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
-val LPAREN = CHAR('(')
-val RPAREN = CHAR(')')
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
-
-// for classifying the strings that have been recognised
-
-abstract class Token
-case object T_WHITESPACE extends Token
-case object T_NUM extends Token
-case class T_OP(s: String) extends Token
-case object T_LPAREN extends Token
-case object T_RPAREN extends Token
-case class NT(s: String) extends Token
-
-
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] =
- tokenize(rs, s.toList).filterNot(_ match {
- case T_WHITESPACE => true
- case _ => false
- })
-
-// lexing rules for arithmetic expressions
-val lexing_rules: List[Rule[Token]]=
- List((NUMBER, (s) => T_NUM),
- (WHITESPACE, (s) => T_WHITESPACE),
- (LPAREN, (s) => T_LPAREN),
- (RPAREN, (s) => T_RPAREN),
- (OPS, (s) => T_OP(s.mkString)))
-
-
-type Grammar = List[(String, List[Token])]
-
-// grammar for arithmetic expressions
-val grammar =
- List ("F" -> List(T_NUM),
- "E" -> List(T_NUM),
- "E" -> List(NT("E"), T_OP("+"), NT("E")),
- "E" -> List(NT("E"), T_OP("-"), NT("E")),
- "E" -> List(NT("E"), T_OP("*"), NT("E")),
- "E" -> List(T_LPAREN, NT("E"), T_RPAREN))
-
-
-def chop[A](ts1: List[A], prefix: List[A], ts2: List[A]) : Option[(List[A], List[A])] =
- ts1 match {
- case Nil => None
- case t::ts =>
- if (ts1.startsWith(prefix)) Some(ts2.reverse, ts1.drop(prefix.length))
- else chop(ts, prefix, t::ts2)
- }
-
-// examples
-chop(List(1,2,3,4,5,6,7,8,9), List(4,5), Nil)
-chop(List(1,2,3,4,5,6,7,8,9), List(3,5), Nil)
-
-def replace[A](ts: List[A], out: List[A], in: List [A]) =
- chop(ts, out, Nil) match {
- case None => None
- case Some((before, after)) => Some(before ::: in ::: after)
- }
-
-def parse(g: Grammar, ts: List[Token]) : Boolean = {
- println(ts)
- if (ts == List(NT("E"))) true
- else {
- val tss = for ((lhs, rhs) <- g) yield replace(ts, rhs, List(NT(lhs)))
- tss.flatten.exists(parse(g, _))
- }
-}
-
-def parser(g: Grammar, rs: List[Rule[Token]], s: String) = {
- println("\n")
- parse(g, tokenizer(rs, s))
-}
-
-
-
-parser(grammar, lexing_rules, "2 + 3 * 4 + 1")
-parser(grammar, lexing_rules, "(2 + 3) * (4 + 1)")
-parser(grammar, lexing_rules, "(2 + 3) * 4 (4 + 1)")
-
-
-
--- a/parser1.scala Wed Nov 21 09:04:11 2012 +0000
+++ b/parser1.scala Fri Nov 23 14:08:31 2012 +0000
@@ -1,31 +1,27 @@
-:load matcher.scala
+// A naive bottom-up parser with backtracking
+//
+// Needs:
+// :load matcher.scala
// some regular expressions
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
+val DIGIT = RANGE("0123456789")
+val NONZERODIGIT = RANGE("123456789")
val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
val LPAREN = CHAR('(')
val RPAREN = CHAR(')')
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
+val WHITESPACE = PLUS(RANGE(" \n"))
+val OPS = RANGE("+-*")
// for classifying the strings that have been recognised
+
abstract class Token
case object T_WHITESPACE extends Token
case object T_NUM extends Token
case class T_OP(s: String) extends Token
case object T_LPAREN extends Token
case object T_RPAREN extends Token
-case class T_NT(s: String, rhs: List[Token]) extends Token
-
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] =
- tokenize(rs, s.toList).filterNot(_ match {
- case T_WHITESPACE => true
- case _ => false
- })
-
-
+case class NT(s: String) extends Token
// lexing rules for arithmetic expressions
val lexing_rules: List[Rule[Token]]=
@@ -35,33 +31,30 @@
(RPAREN, (s) => T_RPAREN),
(OPS, (s) => T_OP(s.mkString)))
+// the tokenizer
+val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
type Grammar = List[(String, List[Token])]
// grammar for arithmetic expressions
val grammar =
- List ("E" -> List(T_NUM),
- "E" -> List(T_NT("E", Nil), T_OP("+"), T_NT("E", Nil)),
- "E" -> List(T_NT("E", Nil), T_OP("-"), T_NT("E", Nil)),
- "E" -> List(T_NT("E", Nil), T_OP("*"), T_NT("E", Nil)),
- "E" -> List(T_LPAREN, T_NT("E", Nil), T_RPAREN))
+ List ("F" -> List(T_NUM),
+ "E" -> List(T_NUM),
+ "E" -> List(NT("E"), T_OP("+"), NT("E")),
+ "E" -> List(NT("E"), T_OP("-"), NT("E")),
+ "E" -> List(NT("E"), T_OP("*"), NT("E")),
+ "E" -> List(T_LPAREN, NT("E"), T_RPAREN))
-def startsWith[A](ts1: List[A], ts2: List[A]) : Boolean = (ts1, ts2) match {
- case (_, Nil) => true
- case (T_NT(e, _)::ts1,T_NT(f, _)::ts2) => (e == f) && startsWith(ts1, ts2)
- case (t1::ts1, t2::ts2) => (t1 == t2) && startsWith(ts1, ts2)
- case _ => false
-}
def chop[A](ts1: List[A], prefix: List[A], ts2: List[A]) : Option[(List[A], List[A])] =
ts1 match {
case Nil => None
case t::ts =>
- if (startsWith(ts1, prefix)) Some(ts2.reverse, ts1.drop(prefix.length))
+ if (ts1.startsWith(prefix)) Some(ts2.reverse, ts1.drop(prefix.length))
else chop(ts, prefix, t::ts2)
}
-// examples
+// examples for chop
chop(List(1,2,3,4,5,6,7,8,9), List(4,5), Nil)
chop(List(1,2,3,4,5,6,7,8,9), List(3,5), Nil)
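+// (the first call yields Some((List(1, 2, 3), List(6, 7, 8, 9))), the second yields None)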
@@ -71,18 +64,25 @@
case Some((before, after)) => Some(before ::: in ::: after)
}
-def parse1(g: Grammar, ts: List[Token]) : Boolean = ts match {
- case List(T_NT("E", tree)) => { println(tree); true }
- case _ => {
- val tss = for ((lhs, rhs) <- g) yield replace(ts, rhs, List(T_NT(lhs, rhs)))
- tss.flatten.exists(parse1(g, _))
+def parse(g: Grammar, ts: List[Token]) : Boolean = {
+ println(ts)
+ if (ts == List(NT("E"))) true
+ else {
+ val tss = for ((lhs, rhs) <- g) yield replace(ts, rhs, List(NT(lhs)))
+ tss.flatten.exists(parse(g, _))
}
}
+def parser(g: Grammar, s: String) = {
+ println("\n")
+ parse(g, Tok.fromString(s))
+}
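+// Example: for the string "2 + 3" the token list can be reduced to the start symbol
+// NT("E") as follows (one possible reduction sequence; parse finds it by trying all
+// grammar rules and backtracking):
+//
+//   List(T_NUM, T_OP(+), T_NUM)
+//   List(NT(E), T_OP(+), T_NUM)     by "E" -> List(T_NUM)
+//   List(NT(E), T_OP(+), NT(E))     by "E" -> List(T_NUM)
+//   List(NT(E))                     by "E" -> List(NT("E"), T_OP("+"), NT("E"))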
+
-println() ; parse1(grammar, tokenizer(lexing_rules, "2 + 3 * 4 + 1"))
-println() ; parse1(grammar, tokenizer(lexing_rules, "(2 + 3) * (4 + 1)"))
-println() ; parse1(grammar, tokenizer(lexing_rules, "(2 + 3) * 4 (4 + 1)"))
+
+parser(grammar, "2 + 3 * 4 + 1")
+parser(grammar, "(2 + 3) * (4 + 1)")
+parser(grammar, "(2 + 3) * 4 (4 + 1)")
--- a/parser2.scala Wed Nov 21 09:04:11 2012 +0000
+++ b/parser2.scala Fri Nov 23 14:08:31 2012 +0000
@@ -1,18 +1,21 @@
-:load matcher.scala
+// A naive version of parser combinators producing parse trees
+//
+// Needs
+// :load matcher.scala
// some regular expressions
-val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
+val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz")
val ID = PLUS(LETTER)
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
+val DIGIT = RANGE("0123456789")
+val NONZERODIGIT = RANGE("123456789")
val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
val LPAREN = CHAR('(')
val RPAREN = CHAR(')')
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
+val WHITESPACE = PLUS(RANGE(" \n"))
+val OPS = RANGE("+-*")
// for classifying the strings that have been recognised
abstract class Token
@@ -27,13 +30,6 @@
case object T_THEN extends Token
case object T_ELSE extends Token
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] =
- tokenize(rs, s.toList).filterNot(_ match {
- case T_WHITESPACE => true
- case _ => false
- })
-
-
// lexing rules for arithmetic expressions
val lexing_rules: List[Rule[Token]]=
List(("if", (s) => T_IF),
@@ -46,6 +42,8 @@
(RPAREN, (s) => T_RPAREN),
(OPS, (s) => T_OP(s.mkString)))
+val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
+
// parse trees
abstract class ParseTree
@@ -115,8 +113,8 @@
lazy val T: Parser = (F ~ T_OP("*") ~ T) || F
lazy val F: Parser = (T_LPAREN ~ E ~ T_RPAREN) || NumParser
-tokenizer(lexing_rules, "1 + 2 + 3")
-println(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3")))
+println(Tok.fromString("1 + 2 + 3"))
+println(E.parse_all(Tok.fromString("1 + 2 + 3")))
def eval(t: ParseTree) : Int = t match {
case Leaf(T_NUM(n)) => n.toInt
@@ -125,17 +123,16 @@
case Branch(List(Leaf(T_LPAREN), t, Leaf(T_RPAREN))) => eval(t)
}
-(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3"))).map(eval(_))
-(E.parse_all(tokenizer(lexing_rules, "1 + 2 * 3"))).map(eval(_))
-(E.parse_all(tokenizer(lexing_rules, "(1 + 2) * 3"))).map(eval(_))
+(E.parse_all(Tok.fromString("1 + 2 + 3"))).map(eval(_))
+(E.parse_all(Tok.fromString("1 + 2 * 3"))).map(eval(_))
lazy val EXPR: Parser =
new ListParser(List(T_IF, EXPR, T_THEN, EXPR)) ||
new ListParser(List(T_IF, EXPR, T_THEN, EXPR, T_ELSE, EXPR)) ||
IdParser
-println(EXPR.parse_all(tokenizer(lexing_rules, "if a then b else c")))
-println(EXPR.parse_all(tokenizer(lexing_rules, "if a then if x then y else c")))
+println(EXPR.parse_all(Tok.fromString("if a then b else c")))
+println(EXPR.parse_all(Tok.fromString("if a then if x then y else c")))
--- a/parser2a.scala Wed Nov 21 09:04:11 2012 +0000
+++ b/parser2a.scala Fri Nov 23 14:08:31 2012 +0000
@@ -1,18 +1,22 @@
-:load matcher.scala
+// Parser combinators including semantic actions
+// parsing lists of tokens
+//
+// Needs
+// :load matcher.scala
// some regular expressions
-val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
+val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz")
val ID = PLUS(LETTER)
-val DIGIT = RANGE("0123456789".toList)
-val NONZERODIGIT = RANGE("123456789".toList)
+val DIGIT = RANGE("0123456789")
+val NONZERODIGIT = RANGE("123456789")
val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
val LPAREN = CHAR('(')
val RPAREN = CHAR(')')
-val WHITESPACE = PLUS(RANGE(" \n".toList))
-val OPS = RANGE("+-*".toList)
+val WHITESPACE = PLUS(RANGE(" \n"))
+val OPS = RANGE("+-*")
// for classifying the strings that have been recognised
abstract class Token
@@ -27,13 +31,6 @@
case object T_THEN extends Token
case object T_ELSE extends Token
-def tokenizer(rs: List[Rule[Token]], s: String) : List[Token] =
- tokenize(rs, s.toList).filterNot(_ match {
- case T_WHITESPACE => true
- case _ => false
- })
-
-
// lexing rules for arithmetic expressions
val lexing_rules: List[Rule[Token]]=
List(("if", (s) => T_IF),
@@ -46,6 +43,7 @@
(RPAREN, (s) => T_RPAREN),
(OPS, (s) => T_OP(s.mkString)))
+val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
// parser combinators with return type T
abstract class Parser[T] {
@@ -98,9 +96,10 @@
lazy val T: Parser[Int] = (F ~ T_OP("*") ~ T) ==> { case ((x, y), z) => x * z } || F
lazy val F: Parser[Int] = (T_LPAREN ~> E <~ T_RPAREN) || NumParser
-println(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3")))
-println(E.parse_all(tokenizer(lexing_rules, "1 + 2 * 3")))
-println(E.parse_all(tokenizer(lexing_rules, "(1 + 2) * 3")))
+println(E.parse_all(Tok.fromString("1 + 2 + 3")))
+println(E.parse_all(Tok.fromString("1 + 2 * 3")))
+println(E.parse_all(Tok.fromString("(1 + 2) * 3")))
-println(E.parse_all(tokenizer(lexing_rules, "(1 - 2) * 3")))
-println(E.parse_all(tokenizer(lexing_rules, "(1 + 2) * - 3")))
+// Exercise: implement minus (a possible sketch follows below)
+println(E.parse_all(Tok.fromString("(1 - 2) * 3")))
+println(E.parse_all(Tok.fromString("(1 + 2) * - 3")))
--- a/parser4.scala Wed Nov 21 09:04:11 2012 +0000
+++ b/parser4.scala Fri Nov 23 14:08:31 2012 +0000
@@ -77,6 +77,6 @@
"2" ==> { (s) => 2 } ||
"3" ==> { (s) => 3 })
-println(E.parse_all("1+2*3"))
+println(E.parse_all("1+2*3+3"))
--- a/test.html Wed Nov 21 09:04:11 2012 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-<b>MSc Projects</b>
-
-<p>
-start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.
-</p>
-
-
- <p><b>Description:</b>
- <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,
- lexing programs, syntax highlighting and so on. Given that regular expressions were
- introduced in 1950 by <a>Stephen Kleene</a>, you might think
- regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still
- an active research area. For example
- <a>this paper</a>
- about regular expression matching and partial derivatives was presented this summer at the international
- PPDP'12 conference. The task in this project is to implement the results from this paper.</p>
-
- <p>The background for this project is that some regular expressions are
- <a>evil</a>
- and can stab you in the back; according to
- this <a>blog post</a>.
- For example, if you use in <a>Python</a> or
- in <a>Ruby</a> (probably also in other mainstream programming languages) the
- innocently looking regular expression a?{28}a{28} and match it, say, against the string
- <red>aaaaaaaaaaaaaaaaaaaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,
- Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:
- <a>re.py</a> (Python version) and
- <a>re.rb</a>
- (Ruby version). You can imagine an attacker
- mounting a nice <a>DoS attack</a> against
- your program if it contains such an evil regular expression. Actually
- <a>Scala</a> (and also Java) are almost immune from such
- attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale
- the regular expression and string further to, say, 4,600 as, then you get a
- StackOverflowError
- potentially crashing your program.
- </p>
--- a/while.scala Wed Nov 21 09:04:11 2012 +0000
+++ b/while.scala Fri Nov 23 14:08:31 2012 +0000
@@ -105,8 +105,9 @@
(T_KWD("skip") ==> ((_) => Skip: Stmt)) ||
(IdParser ~ T_OP(":=") ~ AExp) ==> { case ((x, y), z) => Assign(x, z): Stmt } ||
(T_KWD("if") ~ BExp ~ T_KWD("then") ~ Block ~ T_KWD("else") ~ Block) ==>
- { case (((((x,y),z),u),v),w) => If(y, u, w): Stmt }
-
+ { case (((((x,y),z),u),v),w) => If(y, u, w): Stmt } ||
+ (T_KWD("while") ~ BExp ~ T_KWD("do") ~ Block) ==> { case (((x, y), z), w) => While(y, w) }
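+   // (in the While case the nested pairs (((x, y), z), w) correspond to T_KWD("while"),
+   //  BExp, T_KWD("do") and Block; only the condition y and the body w are needed)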
+
lazy val Stmts: Parser[List[Token], Block] =
(Stmt ~ T_SEMI ~ Stmts) ==> { case ((x, y), z) => x :: z : Block } ||
(Stmt ==> ((s) => List(s) : Block))