//:load matcher.scala

// some regular expressions
val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")
val WORD = PLUS(SYM)
val BTAG = SEQS("<", WORD, ">")    // begin tag, e.g. <b>
val ETAG = SEQS("</", WORD, ">")   // end tag, e.g. </b>
val WHITESPACE = PLUS(RANGE(" \n"))

// for classifying the strings that have been recognised
abstract class Token
case object T_WHITESPACE extends Token
case class T_WORD(s: String) extends Token
case class T_ETAG(s: String) extends Token
case class T_BTAG(s: String) extends Token
case class T_NT(s: String, rhs: List[Token]) extends Token

val lexing_rules: List[Rule[Token]] =
  List((BTAG, (s) => T_BTAG(s.mkString)),
       (ETAG, (s) => T_ETAG(s.mkString)),
       (WORD, (s) => T_WORD(s.mkString)),
       (WHITESPACE, (s) => T_WHITESPACE))

// the tokenizer
val T = Tokenizer(lexing_rules)

// width for printing
val WIDTH = 60

// interprets the token list: c is the current column and ctr is the
// stack of currently active ANSI control sequences (most recent first)
def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit = ts match {
  case Nil => println(Console.RESET)
  case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr)
  case T_WORD(s)::rest => {
    val newstr = Console.RESET + ctr.reverse.mkString + s
    if (c + s.length < WIDTH) {
      print(newstr)
      interpret(rest, c + s.length, ctr)
    }
    else {
      print("\n" + newstr)
      interpret(rest, s.length, ctr)
    }
  }
  case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)
  case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)
  case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)
  case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)
  case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)
  case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)
  case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)
  case T_ETAG(_)::rest => interpret(rest, c, ctr.tail)   // pop a style; assumes well-nested tags
  case _::rest => interpret(rest, c, ctr)                // skip anything else
}
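// A quick sanity check of the tokenizer/interpreter pipeline on a
// tiny input -- a sketch added for illustration, assuming (like the
// call at the end of this file) that the Tokenizer from matcher.scala
// provides fromString: "hello" should come out bold, "world" plain.
interpret(T.fromString("<b>hello</b> world"), 0, Nil)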
val test_string = """<b>MSc Projects</b>

<p>start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal
again something longer.</p>

<p><b>Description:</b> <a>Regular expressions</a> are extremely
useful for many text-processing tasks, such as finding patterns in
texts, lexing programs, syntax highlighting and so on. Given that
regular expressions were introduced in the 1950s by <a>Stephen
Kleene</a>, you might think they have since been studied and
implemented to death. But you would definitely be mistaken: in fact
they are still an active research area. For example, <a>this
paper</a> about regular expression matching and partial derivatives
was presented this summer at the international PPDP'12 conference.
The task in this project is to implement the results from this
paper.</p>

<p>The background for this project is that some regular expressions
are <a>evil</a> and can stab you in the back, according to this
<a>blog post</a>. For example, if you use the innocent-looking
regular expression a?{28}a{28} in <a>Python</a> or <a>Ruby</a>
(and probably in other mainstream programming languages too) and
match it, say, against the string
<red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28
a's), you will soon notice that your CPU usage goes to 100%. In
fact, Python and Ruby need approximately 30 seconds of hard work to
match this string. You can try it for yourself: <a>re.py</a>
(Python version) and <a>re.rb</a> (Ruby version). You can imagine
an attacker mounting a nice <a>DoS attack</a> against your program
if it contains such an evil regular expression. Actually,
<a>Scala</a> (and also Java) is almost immune to such attacks, as
it can deal with strings of up to 4,300 a's in less than a second.
But if you scale the regular expression and string further to, say,
4,600 a's, then you get a StackOverflowError that can crash your
program.</p>"""

interpret(T.fromString(test_string), 0, Nil)
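// The last paragraph of the test string makes a concrete performance
// claim. The sketch below is one way to reproduce it with Scala's
// built-in regex support (which delegates to Java's backtracking
// java.util.regex engine); it is an illustration added here, not part
// of the original demo. The evil expression is written as (a?){n}a{n}
// in Java syntax, and the exact point at which a StackOverflowError
// appears depends on the JVM's stack size, so treat the sample sizes
// below as indicative only.

import scala.util.matching.Regex

def evilTest(n: Int): (Boolean, Long) = {
  val evil = new Regex(s"(a?){$n}a{$n}")   // the evil expression, scaled to n
  val input = "a" * n                      // a string of n a's
  val start = System.nanoTime()
  val matched = evil.pattern.matcher(input).matches()
  val millis = (System.nanoTime() - start) / 1000000
  (matched, millis)
}

// for example (uncomment to try; the last call may throw a
// StackOverflowError, as described above):
//   println(evilTest(28))
//   println(evilTest(4300))
//   println(evilTest(4600))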