author | Christian Urban <christian.urban@kcl.ac.uk> |
Mon, 31 Aug 2020 16:57:15 +0100 | |
changeset 748 | 383f2a5952ce |
parent 93 | 4794759139ea |
permissions | -rw-r--r-- |
66
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
//:load matcher.scala |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
// some regular expressions |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""") |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
val WORD = PLUS(SYM) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
8 |
val BTAG = SEQS("<", WORD, ">") |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
9 |
val ETAG = SEQS("</", WORD, ">") |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
10 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
11 |
val WHITESPACE = PLUS(RANGE(" \n")) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
12 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
13 |
// for classifying the strings that have been recognised |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
14 |
abstract class Token |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
15 |
case object T_WHITESPACE extends Token |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
16 |
case class T_WORD(s: String) extends Token |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
17 |
case class T_ETAG(s: String) extends Token |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
18 |
case class T_BTAG(s: String) extends Token |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
19 |
case class T_NT(s: String, rhs: List[Token]) extends Token |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
20 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
21 |
val lexing_rules: List[Rule[Token]] = |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
22 |
List((BTAG, (s) => T_BTAG(s.mkString)), |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
23 |
(ETAG, (s) => T_ETAG(s.mkString)), |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
24 |
(WORD, (s) => T_WORD(s.mkString)), |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
25 |
(WHITESPACE, (s) => T_WHITESPACE)) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
26 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
27 |
// the tokenizer |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
28 |
val T = Tokenizer(lexing_rules) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
29 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
30 |
// width for printing |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
31 |
val WIDTH = 60 |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
32 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
33 |
|
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
34 |
def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit= ts match { |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
35 |
case Nil => println(Console.RESET) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
36 |
case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
37 |
case T_WORD(s)::rest => { |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
38 |
val newstr = Console.RESET + ctr.reverse.mkString + s |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
39 |
if (c + s.length < WIDTH) { |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
40 |
print(newstr); |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
41 |
interpret(rest, c + s.length, ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
42 |
} |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
43 |
else { |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
44 |
print("\n" + newstr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
45 |
interpret(rest, s.length, ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
46 |
} |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
47 |
} |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
48 |
case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
49 |
case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
50 |
case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
51 |
case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
52 |
case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
53 |
case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
54 |
case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
55 |
case T_ETAG(_)::rest => interpret(rest, c, ctr.tail) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
56 |
case _::rest => interpret(rest, c, ctr) |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
57 |
} |
9215b9fb8852
tuned
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
58 |
|
71
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
59 |
val test_string = """ |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
60 |
<b>MSc Projects</b> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
61 |
|
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
62 |
<p> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
63 |
start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer. |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
64 |
</p> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
65 |
|
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
66 |
|
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
67 |
<p><b>Description:</b> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
68 |
<a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts, |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
69 |
lexing programs, syntax highlighting and so on. Given that regular expressions were |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
70 |
introduced in 1950 by <a>Stephen Kleene</a>, you might think |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
71 |
regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
72 |
an active research area. For example |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
73 |
<a>this paper</a> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
74 |
about regular expression matching and partial derivatives was presented this summer at the international |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
75 |
PPDP'12 conference. The task in this project is to implement the results from this paper.</p> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
76 |
|
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
77 |
<p>The background for this project is that some regular expressions are |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
78 |
<a>evil</a> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
79 |
and can stab you in the back; according to |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
80 |
this <a>blog post</a>. |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
81 |
For example, if you use in <a>Python</a> or |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
82 |
in <a>Ruby</a> (probably also in other mainstream programming languages) the |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
83 |
innocently looking regular expression a?{28}a{28} and match it, say, against the string |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
84 |
<red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact, |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
85 |
Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself: |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
86 |
<a>re.py</a> (Python version) and |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
87 |
<a>re.rb</a> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
88 |
(Ruby version). You can imagine an attacker |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
89 |
mounting a nice <a>DoS attack</a> against |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
90 |
your program if it contains such an evil regular expression. Actually |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
91 |
<a>Scala</a> (and also Java) are almost immune from such |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
92 |
attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
93 |
the regular expression and string further to, say, 4,600 as, then you get a |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
94 |
StackOverflowError |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
95 |
potentially crashing your program. |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
96 |
</p> |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
97 |
""" |
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
98 |
|
7717f20f0504
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
66
diff
changeset
|
99 |
interpret(T.fromString(test_string), 0, Nil) |