| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Fri, 26 Sep 2025 23:10:52 +0100 | |
| changeset 991 | 5d01eccc2036 | 
| parent 93 | 4794759139ea | 
| permissions | -rw-r--r-- | 
| 66 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 1 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 2 | //:load matcher.scala | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 3 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 4 | // some regular expressions | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 5 | val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 6 | val WORD = PLUS(SYM) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 7 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 8 | val BTAG = SEQS("<", WORD, ">") 
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 9 | val ETAG = SEQS("</", WORD, ">")
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 10 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 11 | val WHITESPACE = PLUS(RANGE(" \n"))
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 12 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 13 | // for classifying the strings that have been recognised | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 14 | abstract class Token | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 15 | case object T_WHITESPACE extends Token | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 16 | case class T_WORD(s: String) extends Token | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 17 | case class T_ETAG(s: String) extends Token | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 18 | case class T_BTAG(s: String) extends Token | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 19 | case class T_NT(s: String, rhs: List[Token]) extends Token | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 20 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 21 | val lexing_rules: List[Rule[Token]] = | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 22 | List((BTAG, (s) => T_BTAG(s.mkString)), | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 23 | (ETAG, (s) => T_ETAG(s.mkString)), | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 24 | (WORD, (s) => T_WORD(s.mkString)), | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 25 | (WHITESPACE, (s) => T_WHITESPACE)) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 26 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 27 | // the tokenizer | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 28 | val T = Tokenizer(lexing_rules) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 29 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 30 | // width for printing | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 31 | val WIDTH = 60 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 32 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 33 | |
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 34 | def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit= ts match {
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 35 | case Nil => println(Console.RESET) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 36 | case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 37 |   case T_WORD(s)::rest => {
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 38 | val newstr = Console.RESET + ctr.reverse.mkString + s | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 39 |     if (c + s.length < WIDTH) {
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 40 | print(newstr); | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 41 | interpret(rest, c + s.length, ctr) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 42 | } | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 43 |     else {
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 44 |       print("\n" + newstr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 45 | interpret(rest, s.length, ctr) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 46 | } | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 47 | } | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 48 |   case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 49 |   case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 50 |   case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 51 |   case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 52 |   case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 53 |   case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 54 |   case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)
 | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 55 | case T_ETAG(_)::rest => interpret(rest, c, ctr.tail) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 56 | case _::rest => interpret(rest, c, ctr) | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 57 | } | 
| 
9215b9fb8852
tuned
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff
changeset | 58 | |
| 71 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 59 | val test_string = """ | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 60 | <b>MSc Projects</b> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 61 | |
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 62 | <p> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 63 | start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer. | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 64 | </p> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 65 | |
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 66 | |
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 67 | <p><b>Description:</b> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 68 | <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts, | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 69 | lexing programs, syntax highlighting and so on. Given that regular expressions were | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 70 | introduced in 1950 by <a>Stephen Kleene</a>, you might think | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 71 | regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 72 | an active research area. For example | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 73 | <a>this paper</a> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 74 | about regular expression matching and partial derivatives was presented this summer at the international | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 75 | PPDP'12 conference. The task in this project is to implement the results from this paper.</p> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 76 | |
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 77 | <p>The background for this project is that some regular expressions are | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 78 | <a>evil</a> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 79 | and can stab you in the back; according to | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 80 | this <a>blog post</a>. | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 81 | For example, if you use in <a>Python</a> or | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 82 | in <a>Ruby</a> (probably also in other mainstream programming languages) the | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 83 |   innocently looking regular expression a?{28}a{28} and match it, say, against the string 
 | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 84 | <red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact, | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 85 | Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself: | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 86 | <a>re.py</a> (Python version) and | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 87 | <a>re.rb</a> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 88 | (Ruby version). You can imagine an attacker | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 89 | mounting a nice <a>DoS attack</a> against | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 90 | your program if it contains such an evil regular expression. Actually | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 91 | <a>Scala</a> (and also Java) are almost immune from such | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 92 | attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 93 | the regular expression and string further to, say, 4,600 as, then you get a | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 94 | StackOverflowError | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 95 | potentially crashing your program. | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 96 | </p> | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 97 | """ | 
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 98 | |
| 
7717f20f0504
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
66diff
changeset | 99 | interpret(T.fromString(test_string), 0, Nil) |