# HG changeset patch # User Christian Urban # Date 1353334722 0 # Node ID 2d625418c011909fdac21ac3065fc3df78bb01c1 # Parent dff4b062a8a90cfd3ce4b5e4fe38fbe4fb007b09 added everything diff -r dff4b062a8a9 -r 2d625418c011 S_grammar-token.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/S_grammar-token.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,47 @@ +//:load matcher.scala +//:load parser3.scala + +abstract class Token +case object T_ONE extends Token + +val lexing_rules : List[Rule[Token]] = + List(("1", (s: List[Char]) => T_ONE)) + +val T = Tokenizer(lexing_rules) + +case class TokParser(tok: Token) extends Parser[List[Token], Token] { + def parse(ts: List[Token]) = ts match { + case t::ts if (t == tok) => Set((t, ts)) + case _ => Set () + } +} +implicit def token2tokparser(t: Token) = TokParser(t) + +case object EmpParser extends Parser[List[Token], String] { + def parse(ts: List[Token]) = Set(("", ts)) +} + + +lazy val Su: Parser[List[Token], String] = + (T_ONE ~ Su) ==> { case (x, y) => "1" + y} || EmpParser + + +def time_needed[T](i: Int, code: => T) = { + val start = System.nanoTime() + for (j <- 1 to i) code + val end = System.nanoTime() + (end - start)/(i * 1.0e9) +} + +def test(i: Int) = { + val result = Su.parse_all(T.fromString("1" * i)) + //print(result.size + " ") +} + + +for (i <- 1 to 1000 by 50) { + print(i + " ") + print("%.5f".format(time_needed(1, test(i)))) + print("\n") +} + diff -r dff4b062a8a9 -r 2d625418c011 S_grammar.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/S_grammar.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,52 @@ +//:load parser3.scala + +case class StringParser(s: String) extends Parser[String, String] { + def parse(ts: String) = { + if (s.length <= ts.length && ts.startsWith(s)) Set((s, ts.drop(s.length))) + else Set() + } +} + +implicit def string2parser(s: String) = StringParser(s) + +def time_needed[T](i: Int, code: => T) = { + val start = System.nanoTime() + for (j <- 1 to i) code + val end = System.nanoTime() + (end - start)/(i * 1.0e9) +} + +// unambiguous grammar + +lazy val U: Parser[String, String] = + ("1" ~ U) ==> { case (x, y) => "1" + y} || "" + +def test1(i: Int) = { + val result = U.parse_all("1" * i) + //print(result.size + " ") +} + +for (i <- 1 to 1000 by 50) { + print(i + " ") + print("%.5f".format(time_needed(1, test1(i)))) + print("\n") +} + + + +// ambiguous grammar +// n = 16 -> over 35 million parse trees + +lazy val S: Parser[String, String] = + ("1" ~ S ~ S) ==> { case ((x, y), z) => "1" + y + z} || "" + +def test2(i: Int) = { + val result = S.parse_all("1" * i) + print(result.size + " ") +} + +for (i <- 1 to 30) { + print(i + " ") + print("%.5f".format(time_needed(1, test2(i)))) + print("\n") +} diff -r dff4b062a8a9 -r 2d625418c011 Term_grammar.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Term_grammar.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,62 @@ +//:load matcher.scala +//:load parser3.scala + +// some regular expressions +val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz") +val ID = PLUS(LETTER) + +val DIGIT = RANGE("0123456789") +val NONZERODIGIT = RANGE("123456789") +val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0") + +val LPAREN = CHAR('(') +val RPAREN = CHAR(')') + +val WHITESPACE = PLUS(RANGE(" \n")) +val OPS = RANGE("+-*") + +// for classifying the strings that have been lexed +abstract class Token + +case object T_WHITESPACE extends Token +case class T_NUM(s: String) extends Token +case class T_OP(s: String) extends Token +case object T_LPAREN extends Token +case object 
T_RPAREN extends Token + + +// lexing rules for arithmetic expressions +val lexing_rules: List[Rule[Token]]= + List((NUMBER, (s) => T_NUM(s.mkString)), + (WHITESPACE, (s) => T_WHITESPACE), + (LPAREN, (s) => T_LPAREN), + (RPAREN, (s) => T_RPAREN), + (OPS, (s) => T_OP(s.mkString))) + +val Tk = Tokenizer(lexing_rules, List(T_WHITESPACE)) + + +case class TokParser(tok: Token) extends Parser[List[Token], Token] { + def parse(ts: List[Token]) = ts match { + case t::ts if (t == tok) => Set((t, ts)) + case _ => Set () + } +} +implicit def token2tparser(t: Token) = TokParser(t) + +case object NumParser extends Parser[List[Token], Int] { + def parse(ts: List[Token]) = ts match { + case T_NUM(s)::ts => Set((s.toInt, ts)) + case _ => Set () + } +} + +lazy val E: Parser[List[Token], Int] = (T ~ T_OP("+") ~ E) ==> { case ((x, y), z) => x + z } || T +lazy val T: Parser[List[Token], Int] = (F ~ T_OP("*") ~ T) ==> { case ((x, y), z) => x * z } || F +lazy val F: Parser[List[Token], Int] = (T_LPAREN ~> E <~ T_RPAREN) || NumParser + +println(E.parse_all(Tk.fromString("1 + 2 + 3"))) +println(E.parse_all(Tk.fromString("1 + 2 * 3"))) +println(E.parse_all(Tk.fromString("(1 + 2) * 3"))) +println(E.parse_all(Tk.fromString("(14 + 2) * (3 + 2)"))) + diff -r dff4b062a8a9 -r 2d625418c011 crawler1.scala --- a/crawler1.scala Wed Nov 14 08:46:00 2012 +0000 +++ b/crawler1.scala Mon Nov 19 14:18:42 2012 +0000 @@ -42,3 +42,4 @@ // call on the command line crawl(startURL, 2) +crawl("""http://www.dcs.kcl.ac.uk/staff/urbanc/msc-projects-12.html""", 2) diff -r dff4b062a8a9 -r 2d625418c011 html-tokens.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html-tokens.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,112 @@ +import Console._ + +// regular expressions including NOT +abstract class Rexp { + def ~ (right : Rexp) = SEQ(this, right) + def || (right : Rexp) = ALT(this, right) +} +case object NULL extends Rexp +case object EMPTY extends Rexp +case class CHAR(c: Char) extends Rexp +case class ALT(r1: Rexp, r2: Rexp) extends Rexp +case class SEQ(r1: Rexp, r2: Rexp) extends Rexp +case class STAR(r: Rexp) extends Rexp +case class NOT(r: Rexp) extends Rexp + +// some convenience for typing in regular expressions +def charlist2rexp(s : List[Char]) : Rexp = s match { + case Nil => EMPTY + case c::Nil => CHAR(c) + case c::s => SEQ(CHAR(c), charlist2rexp(s)) +} +implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) + + +// nullable function: tests whether the regular +// expression can recognise the empty string +def nullable (r: Rexp) : Boolean = r match { + case NULL => false + case EMPTY => true + case CHAR(_) => false + case ALT(r1, r2) => nullable(r1) || nullable(r2) + case SEQ(r1, r2) => nullable(r1) && nullable(r2) + case STAR(_) => true + case NOT(r) => !(nullable(r)) +} + +// tests whether a regular expression +// cannot recognise more +def no_more (r: Rexp) : Boolean = r match { + case NULL => true + case EMPTY => false + case CHAR(_) => false + case ALT(r1, r2) => no_more(r1) && no_more(r2) + case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1) + case STAR(_) => false + case NOT(r) => !(no_more(r)) +} + + +// derivative of a regular expression w.r.t. 
a character +def der (c: Char, r: Rexp) : Rexp = r match { + case NULL => NULL + case EMPTY => NULL + case CHAR(d) => if (c == d) EMPTY else NULL + case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) + case SEQ(r1, r2) => + if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) + else SEQ(der(c, r1), r2) + case STAR(r) => SEQ(der(c, r), STAR(r)) + case NOT(r) => NOT(der (c, r)) +} + +def error (s: String) = "Could not lex " + s + +def munch(r: Rexp, s: List[Char], t: List[Char]) : Option[(List[Char], List[Char])] = { + //println("Look at" + s); + s match { + case Nil if (nullable(r)) => Some(Nil, t) + case Nil => None + case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, t) + case c::s if (no_more(der (c, r))) => None + case c::s => munch(der (c, r), s, t ::: List(c)) + } +} + +def one_string (regs: List[Rexp], s: List[Char]) : Either[(List[Char], List[Char]), String] = { + val somes = regs.map { munch(_, s, Nil) } .flatten + if (somes == Nil) Right(error(s.mkString)) else Left(somes sortBy (_._1.length) head) +} + +def tokenize (regs: List[Rexp], s: List[Char]) : List[String] = s match { + case Nil => Nil + case _ => one_string(regs, s) match { + case Left((rest, s)) => { println("tokenized: " + s.mkString) ; s.mkString :: tokenize(regs, rest) } + case Right(msg) => { println(msg); sys.exit() } + } +} + + +// regular expression for specifying +// ranges of characters +def RANGE(s : List[Char]) : Rexp = s match { + case Nil => NULL + case c::Nil => CHAR(c) + case c::s => CHAR(c) || RANGE(s) +} + +//one or more +def PLUS(r: Rexp) = r ~ STAR(r) + + +//some regular expressions +val COLOUR = "BLACK" || "BLUE" || "CYAN" || "GREEN" || "MAGENTA" || "RED" || "WHITE" || "YELLOW" + +//BOLD || BLINK +//INVISIBLE +//RESET +//REVERSED +//UNDERLINED + +println(RED + BOLD + "test") +println(RESET) diff -r dff4b062a8a9 -r 2d625418c011 matcher.scala --- a/matcher.scala Wed Nov 14 08:46:00 2012 +0000 +++ b/matcher.scala Mon Nov 19 14:18:42 2012 +0000 @@ -11,15 +11,6 @@ case class NOT(r: Rexp) extends Rexp -// some convenience for typing in regular expressions -def charlist2rexp(s : List[Char]) : Rexp = s match { - case Nil => EMPTY - case c::Nil => CHAR(c) - case c::s => SEQ(CHAR(c), charlist2rexp(s)) -} -implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) - - // nullable function: tests whether the regular // expression can recognise the empty string def nullable (r: Rexp) : Boolean = r match { @@ -59,42 +50,74 @@ // regular expression for specifying // ranges of characters -def RANGE(s : List[Char]) : Rexp = s match { +def Range(s : List[Char]) : Rexp = s match { case Nil => NULL case c::Nil => CHAR(c) - case c::s => ALT(CHAR(c), RANGE(s)) + case c::s => ALT(CHAR(c), Range(s)) } +def RANGE(s: String) = Range(s.toList) + // one or more def PLUS(r: Rexp) = SEQ(r, STAR(r)) +// many alternatives +def Alts(rs: List[Rexp]) : Rexp = rs match { + case Nil => NULL + case r::Nil => r + case r::rs => ALT(r, Alts(rs)) +} +def ALTS(rs: Rexp*) = Alts(rs.toList) + +// repetitions +def Seqs(rs: List[Rexp]) : Rexp = rs match { + case Nil => NULL + case r::Nil => r + case r::rs => SEQ(r, Seqs(rs)) +} +def SEQS(rs: Rexp*) = Seqs(rs.toList) + +// some convenience for typing in regular expressions +def charlist2rexp(s : List[Char]) : Rexp = s match { + case Nil => EMPTY + case c::Nil => CHAR(c) + case c::s => SEQ(CHAR(c), charlist2rexp(s)) +} +implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) + type Rule[T] = (Rexp, List[Char] => T) -def error (s: String) = throw new 
IllegalArgumentException ("Cannot tokenize: " + s) +case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) { -def munch[T](r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] = - s match { - case Nil if (nullable(r)) => Some(Nil, action(t)) - case Nil => None - case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t)) - case c::s if (no_more(der (c, r))) => None - case c::s => munch(der (c, r), action, s, t ::: List(c)) + def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] = + s match { + case Nil if (nullable(r)) => Some(Nil, action(t)) + case Nil => None + case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t)) + case c::s if (no_more(der (c, r))) => None + case c::s => munch(der (c, r), action, s, t ::: List(c)) + } + + def one_token(s: List[Char]) : Either[(List[Char], T), String] = { + val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten + if (somes == Nil) Right(s.mkString) + else Left(somes sortBy (_._1.length) head) } -def one_token[T](rs: List[Rule[T]], s: List[Char]) : (List[Char], T) = { - val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) } .flatten - if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head) + def tokenize(cs: List[Char]) : List[T] = cs match { + case Nil => Nil + case _ => one_token(cs) match { + case Left((rest, token)) => token :: tokenize(rest) + case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil } + } + } + + def fromString(s: String) : List[T] = + tokenize(s.toList).filterNot(excl.contains(_)) + + def fromFile(name: String) : List[T] = + fromString(io.Source.fromFile(name).mkString) + } -def tokenize[T](rs: List[Rule[T]], s: List[Char]) : List[T] = s match { - case Nil => Nil - case _ => one_token(rs, s) match { - case (rest, token) => token :: tokenize(rs, rest) - } -} - - - - - diff -r dff4b062a8a9 -r 2d625418c011 parser2.scala --- a/parser2.scala Wed Nov 14 08:46:00 2012 +0000 +++ b/parser2.scala Mon Nov 19 14:18:42 2012 +0000 @@ -115,6 +115,7 @@ lazy val T: Parser = (F ~ T_OP("*") ~ T) || F lazy val F: Parser = (T_LPAREN ~ E ~ T_RPAREN) || NumParser +tokenizer(lexing_rules, "1 + 2 + 3") println(E.parse_all(tokenizer(lexing_rules, "1 + 2 + 3"))) def eval(t: ParseTree) : Int = t match { diff -r dff4b062a8a9 -r 2d625418c011 parser2a.scala --- a/parser2a.scala Wed Nov 14 08:46:00 2012 +0000 +++ b/parser2a.scala Mon Nov 19 14:18:42 2012 +0000 @@ -94,7 +94,7 @@ } } -lazy val E: Parser[Int] = (T ~ T_OP("+") ~ E) ==> { case ((x, y), z) => x + z } || T // start symbol +lazy val E: Parser[Int] = (T ~ T_OP("+") ~ E) ==> { case ((x, y), z) => x + z } || T lazy val T: Parser[Int] = (F ~ T_OP("*") ~ T) ==> { case ((x, y), z) => x * z } || F lazy val F: Parser[Int] = (T_LPAREN ~> E <~ T_RPAREN) || NumParser diff -r dff4b062a8a9 -r 2d625418c011 parser3.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parser3.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,32 @@ + +// parser combinators with input type I and return type T + +abstract class Parser[I <% Seq[_], T] { + def parse(ts: I): Set[(T, I)] + + def parse_all(ts: I) : Set[T] = + for ((head, tail) <- parse(ts); if (tail.isEmpty)) yield head + + def || (right : => Parser[I, T]) : Parser[I, T] = new AltParser(this, right) + def ==>[S] (f: => T => S) : Parser [I, S] = new FunParser(this, f) + def ~[S] (right : => Parser[I, S]) : Parser[I, (T, S)] = new SeqParser(this, right) + def ~>[S] (right : => Parser[I, S]) : Parser[I, S] = 
this ~ right ==> (_._2) + def <~[S] (right : => Parser[I, S]) : Parser[I, T] = this ~ right ==> (_._1) +} + +class SeqParser[I <% Seq[_], T, S](p: => Parser[I, T], q: => Parser[I, S]) extends Parser[I, (T, S)] { + def parse(sb: I) = + for ((head1, tail1) <- p.parse(sb); + (head2, tail2) <- q.parse(tail1)) yield ((head1, head2), tail2) +} + +class AltParser[I <% Seq[_], T](p: => Parser[I, T], q: => Parser[I, T]) extends Parser[I, T] { + def parse(sb: I) = p.parse(sb) ++ q.parse(sb) +} + +class FunParser[I <% Seq[_], T, S](p: => Parser[I, T], f: T => S) extends Parser[I, S] { + def parse(sb: I) = + for ((head, tail) <- p.parse(sb)) yield (f(head), tail) +} + + diff -r dff4b062a8a9 -r 2d625418c011 parser4.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parser4.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,111 @@ + +// parser combinators with input type I and return type T +// and memoisation + +case class SubString(s: String, l: Int, h: Int) { + def low = l + def high = h + def length = h - l + def substring(l: Int = l, h: Int = h) = s.slice(l, h) + def set(low: Int = l, high: Int = h) = SubString(s, low, high) + +} + +type Ctxt = List[(String, SubString)] + +abstract class Parser[T] { + + def parse(ts: SubString, ctxt: Ctxt): Set[(T, SubString)] + + def parse_all(s: String) : Set[T] = + for ((head, tail) <- parse(SubString(s, 0, s.length), Nil); if (tail.substring() == "")) yield head + + def || (right : => Parser[T]) : Parser[T] = new AltParser(this, right) + def ==>[S] (f: => T => S) : Parser [S] = new FunParser(this, f) + def ~[S] (right : => Parser[S]) : Parser[(T, S)] = new SeqParser(this, right) + def ~>[S] (right : => Parser[S]) : Parser[S] = this ~ right ==> (_._2) + def <~[S] (right : => Parser[S]) : Parser[T] = this ~ right ==> (_._1) +} + +class SeqParser[T, S](p: => Parser[T], q: => Parser[S]) extends Parser[(T, S)] { + def parse(sb: SubString, ctxt: Ctxt) = + for ((head1, tail1) <- p.parse(sb, ctxt); + (head2, tail2) <- q.parse(tail1, ctxt)) yield ((head1, head2), tail2) +} + +class AltParser[T](p: => Parser[T], q: => Parser[T]) extends Parser[T] { + def parse(sb: SubString, ctxt: Ctxt) = p.parse(sb, ctxt) ++ q.parse(sb, ctxt) +} + +class FunParser[T, S](p: => Parser[T], f: T => S) extends Parser[S] { + def parse(sb: SubString, ctxt: Ctxt) = + for ((head, tail) <- p.parse(sb, ctxt)) yield (f(head), tail) +} + +case class SubStringParser(s: String) extends Parser[SubString] { + val n = s.length + def parse(sb: SubString, ctxt: Ctxt) = { + if (n <= sb.length && sb.substring(sb.low, sb.low + n) == s) + Set((sb.set(high = sb.low + n), sb.set(low = sb.low + n))) + else Set() + } +} + +implicit def string2parser(s: String) = SubStringParser(s) ==> (_.substring()) + +class IgnLst[T](p: => Parser[T]) extends Parser[T] { + def parse(sb: SubString, ctxt: Ctxt) = { + if (sb.length == 0) Set() + else for ((head, tail) <- p.parse(sb.set(high = sb.high - 1), ctxt)) + yield (head, tail.set(high = tail.high + 1)) + } +} + +class CHECK[T](nt: String, p: => Parser[T]) extends Parser[T] { + def parse(sb: SubString, ctxt: Ctxt) = { + val should_trim = ctxt.contains (nt, sb) + if (should_trim && sb.length == 0) Set() + else if (should_trim) new IgnLst(p).parse(sb, (nt, sb)::ctxt) + else p.parse(sb, (nt, sb)::ctxt) + } +} + +lazy val E: Parser[Int] = + new CHECK("E", (E ~ "+" ~ E) ==> { case ((x, y), z) => x + z} || + (E ~ "*" ~ E) ==> { case ((x, y), z) => x * z} || + ("(" ~ E ~ ")") ==> { case ((x, y), z) => y} || + "0" ==> { (s) => 0 } || + "1" ==> { (s) => 1 } || + "2" ==> { (s) => 2 } 
|| + "3" ==> { (s) => 3 }) + +println("foo " + E.parse_all("1+2*3")) + + +// ambiguous grammar + +lazy val S: Parser[String] = + new CHECK("S", ("1" ~ S ~ S) ==> { case ((x, y), z) => "1" + y + z} || "") + +lazy val S2: Parser[String] = + new CHECK("S2", (S2 ~ S2 ~ S2) ==> { case ((x, y), z) => x + y + z} || "1" || "") + +def test2(i: Int) = { + val result = E.parse_all("1" * i) + print(result.size + " " + result + " ") +} + +def time_needed[T](i: Int, code: => T) = { + val start = System.nanoTime() + for (j <- 1 to i) code + val end = System.nanoTime() + (end - start)/(i * 1.0e9) +} + + +for (i <- 1 to 10) { + print(i + " ") + print("%.5f".format(time_needed(1, test2(i)))) + print("\n") +} + diff -r dff4b062a8a9 -r 2d625418c011 parser5.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parser5.scala Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,113 @@ +val DIGIT = RANGE("0123456789") +val NONZERODIGIT = RANGE("123456789") + +val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0") +val LPAREN = CHAR('(') +val RPAREN = CHAR(')') +val WHITESPACE = PLUS(RANGE(" \n")) +val OPS = RANGE("+-*") + +// for classifying the strings that have been recognised +abstract class Token +case object T_WHITESPACE extends Token +case class T_NUM(s: String) extends Token +case class T_OP(s: String) extends Token +case object T_LPAREN extends Token +case object T_RPAREN extends Token + +val lexing_rules: List[Rule[Token]]= + List((NUMBER, (s) => T_NUM(s.mkString)), + (WHITESPACE, (s) => T_WHITESPACE), + (LPAREN, (s) => T_LPAREN), + (RPAREN, (s) => T_RPAREN), + (OPS, (s) => T_OP(s.mkString))) + +val Tk = Tokenizer(lexing_rules, List(T_WHITESPACE)) + + +// parser combinators with input type I and return type T +// and memoisation + +case class SubList[T](s: List[T], l: Int, h: Int) { + def low = l + def high = h + def length = h - l + def sublist(l: Int = l, h: Int = h) = s.slice(l, h) + def set(low: Int = l, high: Int = h) = SubList(s, low, high) +} + +type Ctxt[T] = List[(String, SubList[T])] + +abstract class Parser[I, T] { + + def parse(ts: SubList[I], ctxt: Ctxt[I]): Set[(T, SubList[I])] + + def parse_all(s: List[I]) : Set[T] = + for ((head, tail) <- parse(SubList(s, 0, s.length), Nil); if (tail.sublist() == Nil)) yield head + + def || (right : => Parser[I, T]) : Parser[I, T] = new AltParser(this, right) + def ==>[S] (f: => T => S) : Parser [I, S] = new FunParser(this, f) + def ~[S] (right : => Parser[I, S]) : Parser[I, (T, S)] = new SeqParser(this, right) + def ~>[S] (right : => Parser[I, S]) : Parser[I, S] = this ~ right ==> (_._2) + def <~[S] (right : => Parser[I, S]) : Parser[I, T] = this ~ right ==> (_._1) +} + +class SeqParser[I, T, S](p: => Parser[I, T], q: => Parser[I, S]) extends Parser[I, (T, S)] { + def parse(sb: SubList[I], ctxt: Ctxt[I]) = + for ((head1, tail1) <- p.parse(sb, ctxt); + (head2, tail2) <- q.parse(tail1, ctxt)) yield ((head1, head2), tail2) +} + +class AltParser[I, T](p: => Parser[I, T], q: => Parser[I, T]) extends Parser[I, T] { + def parse(sb: SubList[I], ctxt: Ctxt[I]) = p.parse(sb, ctxt) ++ q.parse(sb, ctxt) +} + +class FunParser[I, T, S](p: => Parser[I, T], f: T => S) extends Parser[I, S] { + def parse(sb: SubList[I], ctxt: Ctxt[I]) = + for ((head, tail) <- p.parse(sb, ctxt)) yield (f(head), tail) +} + +case object NumParser extends Parser[Token, Int] { + def parse(sb: SubList[Token], ctxt: Ctxt[Token]) = { + if (0 < sb.length) sb.sublist(sb.low, sb.low + 1) match { + case T_NUM(i)::Nil => Set((i.toInt, sb.set(low = sb.low + 1))) + case _ => Set() + } + else Set() + } +} + +case 
class TokParser(t: Token) extends Parser[Token, Token] { + def parse(sb: SubList[Token], ctxt: Ctxt[Token]) = { + if (0 < sb.length && sb.sublist(sb.low, sb.low + 1) == List(t)) Set((t, sb.set(low = sb.low + 1))) + else Set() + } +} + +implicit def token2tparser(t: Token) = TokParser(t) + +class IgnLst[I, T](p: => Parser[I, T]) extends Parser[I, T] { + def parse(sb: SubList[I], ctxt: Ctxt[I]) = { + if (sb.length == 0) Set() + else for ((head, tail) <- p.parse(sb.set(high = sb.high - 1), ctxt)) + yield (head, tail.set(high = tail.high + 1)) + } +} + +class CHECK[I, T](nt: String, p: => Parser[I, T]) extends Parser[I, T] { + def parse(sb: SubList[I], ctxt: Ctxt[I]) = { + val should_trim = ctxt.contains (nt, sb) + if (should_trim && sb.length == 0) Set() + else if (should_trim) new IgnLst(p).parse(sb, (nt, sb)::ctxt) + else p.parse(sb, (nt, sb)::ctxt) + } +} + +lazy val E: Parser[Token, Int] = + new CHECK("E", (E ~ T_OP("+") ~ E) ==> { case ((x, y), z) => x + z} || + (E ~ T_OP("*") ~ E) ==> { case ((x, y), z) => x * z} || + (T_LPAREN ~ E ~ T_RPAREN) ==> { case ((x, y), z) => y} || + NumParser) + +println(E.parse_all(Tk.fromString("1 + 2 * 3"))) +println(E.parse_all(Tk.fromString("(1 + 2) * 3"))) diff -r dff4b062a8a9 -r 2d625418c011 slides07.pdf Binary file slides07.pdf has changed diff -r dff4b062a8a9 -r 2d625418c011 slides07.tex --- a/slides07.tex Wed Nov 14 08:46:00 2012 +0000 +++ b/slides07.tex Mon Nov 19 14:18:42 2012 +0000 @@ -359,9 +359,9 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \mode{ \begin{frame}[c] -\frametitle{\begin{tabular}{c}CFG\end{tabular}} +\frametitle{\begin{tabular}{c}CFGs\end{tabular}} -A \alert{context-free} grammar \bl{$G$} consists of +A \alert{context-free} grammar (CFG) \bl{$G$} consists of: \begin{itemize} \item a finite set of nonterminal symbols (upper case) diff -r dff4b062a8a9 -r 2d625418c011 test.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.html Mon Nov 19 14:18:42 2012 +0000 @@ -0,0 +1,37 @@ +MSc Projects + +

+start of paragraph. a cyan word normal again something longer. +

+ + +

Description: + Regular expressions are extremely useful for many text-processing tasks such as finding patterns in texts, + lexing programs, syntax highlighting and so on. Given that regular expressions were + introduced in 1950 by Stephen Kleene, you might think + regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still + an active research area. For example, + this paper + about regular expression matching and partial derivatives was presented this summer at the international + PPDP'12 conference. The task in this project is to implement the results from this paper.
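The paper mentioned in this description is about regular expression matching with partial derivatives. The files matcher.scala and html-tokens.scala added in this changeset already contain the plain derivative operations der and nullable, so the following minimal sketch shows how repeated derivatives give a matcher. It assumes matcher.scala is loaded; the function matches below is illustrative and not defined anywhere in this patch.

//:load matcher.scala

// a regular expression r matches a string s iff the derivative of r
// w.r.t. all characters of s (taken left to right) is nullable
def matches(r: Rexp, s: String) : Boolean =
  nullable(s.toList.foldLeft(r)((r1, c) => der(c, r1)))

// uses the implicit string2rexp conversion from matcher.scala
println(matches(STAR("ab"), "ababab"))   // true
println(matches(STAR("ab"), "ababa"))    // false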

+ +

The background for this project is that some regular expressions are + evil + and can stab you in the back, according to + this blog post. + For example, if you use in Python or + in Ruby (probably also in other mainstream programming languages) the + innocent-looking regular expression a?{28}a{28} and match it, say, against the string + aaaaaaaaaaaaaaaaaaaaaaaaaaaa (that is, 28 a's), you will soon notice that your CPU usage goes to 100%. In fact, + Python and Ruby need approximately 30 seconds of hard work to match this string. You can try it for yourself: + re.py (Python version) and + re.rb + (Ruby version). You can imagine an attacker + mounting a nice DoS attack against + your program if it contains such an evil regular expression. Actually + Scala (and also Java) are almost immune to such + attacks, as they can deal with strings of up to 4,300 a's in less than a second. But if you scale + the regular expression and string further to, say, 4,600 a's, then you get a + StackOverflowError, + potentially crashing your program. +
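For comparison with the Python and Ruby timings described above, here is a standalone Scala sketch in the style of the time_needed measurements used elsewhere in this changeset. It is illustrative only and not part of the patch: the helper names evil and time_needed exist only in this example. It times the JVM's built-in backtracking matcher on the pattern a?...a?a...a (n optional a's followed by n mandatory a's) against a string of n a's.

// illustrative only, not part of this patch: the sizes below have to be in
// the thousands before the JVM's regex engine gets into trouble, whereas
// Python and Ruby already struggle at n = 28 according to the text above
def time_needed[T](i: Int, code: => T) = {
  val start = System.nanoTime()
  for (j <- 1 to i) code
  val end = System.nanoTime()
  (end - start)/(i * 1.0e9)
}

// the "evil" pattern: n optional a's followed by n mandatory a's
def evil(n: Int) = ("a?" * n) + ("a" * n)

for (n <- List(100, 500, 1000, 2000, 4000)) {
  print(n + " ")
  print("%.5f".format(time_needed(1, ("a" * n).matches(evil(n)))))
  print("\n")
}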