+ −
//:load matcher.scala+ −
+ −
// some regular expressions+ −
val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")+ −
val WORD = PLUS(SYM)+ −
+ −
val BTAG = SEQS("<", WORD, ">") + −
val ETAG = SEQS("</", WORD, ">")+ −
+ −
val WHITESPACE = PLUS(RANGE(" \n"))+ −
+ −
// for classifying the strings that have been recognised+ −
abstract class Token+ −
case object T_WHITESPACE extends Token+ −
case class T_WORD(s: String) extends Token+ −
case class T_ETAG(s: String) extends Token+ −
case class T_BTAG(s: String) extends Token+ −
case class T_NT(s: String, rhs: List[Token]) extends Token+ −
+ −
val lexing_rules: List[Rule[Token]] = + −
List((BTAG, (s) => T_BTAG(s.mkString)),+ −
(ETAG, (s) => T_ETAG(s.mkString)),+ −
(WORD, (s) => T_WORD(s.mkString)),+ −
(WHITESPACE, (s) => T_WHITESPACE))+ −
+ −
// the tokenizer+ −
val T = Tokenizer(lexing_rules)+ −
+ −
// width for printing+ −
val WIDTH = 60+ −
+ −
+ −
def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit= ts match {+ −
case Nil => println(Console.RESET)+ −
case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr)+ −
case T_WORD(s)::rest => {+ −
val newstr = Console.RESET + ctr.reverse.mkString + s+ −
if (c + s.length < WIDTH) {+ −
print(newstr);+ −
interpret(rest, c + s.length, ctr)+ −
}+ −
else {+ −
print("\n" + newstr)+ −
interpret(rest, s.length, ctr)+ −
} + −
}+ −
case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)+ −
case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)+ −
case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)+ −
case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)+ −
case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)+ −
case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)+ −
case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)+ −
case T_ETAG(_)::rest => interpret(rest, c, ctr.tail)+ −
case _::rest => interpret(rest, c, ctr)+ −
}+ −
+ −
val test_string = """+ −
<b>MSc Projects</b>+ −
+ −
<p>+ −
start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.+ −
</p> + −
+ −
+ −
<p><b>Description:</b> + −
<a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,+ −
lexing programs, syntax highlighting and so on. Given that regular expressions were+ −
introduced in 1950 by <a>Stephen Kleene</a>, you might think + −
regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still+ −
an active research area. For example+ −
<a>this paper</a> + −
about regular expression matching and partial derivatives was presented this summer at the international + −
PPDP'12 conference. The task in this project is to implement the results from this paper.</p>+ −
+ −
<p>The background for this project is that some regular expressions are + −
<a>evil</a>+ −
and can stab you in the back; according to+ −
this <a>blog post</a>.+ −
For example, if you use in <a>Python</a> or + −
in <a>Ruby</a> (probably also in other mainstream programming languages) the + −
innocently looking regular expression a?{28}a{28} and match it, say, against the string + −
<red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,+ −
Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:+ −
<a>re.py</a> (Python version) and + −
<a>re.rb</a> + −
(Ruby version). You can imagine an attacker+ −
mounting a nice <a>DoS attack</a> against + −
your program if it contains such an evil regular expression. Actually + −
<a>Scala</a> (and also Java) are almost immune from such+ −
attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale+ −
the regular expression and string further to, say, 4,600 as, then you get a + −
StackOverflowError + −
potentially crashing your program.+ −
</p>+ −
"""+ −
+ −
interpret(T.fromString(test_string), 0, Nil)+ −