| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Tue, 30 May 2023 13:27:54 +0100 | |
| changeset 908 | 68df565a6134 | 
| parent 93 | 4794759139ea | 
| permissions | -rw-r--r-- | 
| 
66
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
1  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
2  | 
//:load matcher.scala  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
3  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
4  | 
// some regular expressions  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
5  | 
val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
6  | 
val WORD = PLUS(SYM)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
7  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
8  | 
val BTAG = SEQS("<", WORD, ">") 
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
9  | 
val ETAG = SEQS("</", WORD, ">")
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
10  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
11  | 
val WHITESPACE = PLUS(RANGE(" \n"))
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
12  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
13  | 
// for classifying the strings that have been recognised  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
14  | 
abstract class Token  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
15  | 
case object T_WHITESPACE extends Token  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
16  | 
case class T_WORD(s: String) extends Token  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
17  | 
case class T_ETAG(s: String) extends Token  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
18  | 
case class T_BTAG(s: String) extends Token  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
19  | 
case class T_NT(s: String, rhs: List[Token]) extends Token  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
20  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
21  | 
val lexing_rules: List[Rule[Token]] =  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
22  | 
List((BTAG, (s) => T_BTAG(s.mkString)),  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
23  | 
(ETAG, (s) => T_ETAG(s.mkString)),  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
24  | 
(WORD, (s) => T_WORD(s.mkString)),  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
25  | 
(WHITESPACE, (s) => T_WHITESPACE))  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
26  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
27  | 
// the tokenizer  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
28  | 
val T = Tokenizer(lexing_rules)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
29  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
30  | 
// width for printing  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
31  | 
val WIDTH = 60  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
32  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
33  | 
|
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
34  | 
def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit= ts match {
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
35  | 
case Nil => println(Console.RESET)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
36  | 
case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
37  | 
  case T_WORD(s)::rest => {
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
38  | 
val newstr = Console.RESET + ctr.reverse.mkString + s  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
39  | 
    if (c + s.length < WIDTH) {
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
40  | 
print(newstr);  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
41  | 
interpret(rest, c + s.length, ctr)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
42  | 
}  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
43  | 
    else {
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
44  | 
      print("\n" + newstr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
45  | 
interpret(rest, s.length, ctr)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
46  | 
}  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
47  | 
}  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
48  | 
  case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
49  | 
  case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
50  | 
  case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
51  | 
  case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
52  | 
  case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
53  | 
  case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
54  | 
  case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)
 | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
55  | 
case T_ETAG(_)::rest => interpret(rest, c, ctr.tail)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
56  | 
case _::rest => interpret(rest, c, ctr)  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
57  | 
}  | 
| 
 
9215b9fb8852
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents:  
diff
changeset
 | 
58  | 
|
| 
71
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
59  | 
val test_string = """  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
60  | 
<b>MSc Projects</b>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
61  | 
|
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
62  | 
<p>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
63  | 
start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
64  | 
</p>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
65  | 
|
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
66  | 
|
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
67  | 
<p><b>Description:</b>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
68  | 
<a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
69  | 
lexing programs, syntax highlighting and so on. Given that regular expressions were  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
70  | 
introduced in 1950 by <a>Stephen Kleene</a>, you might think  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
71  | 
regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
72  | 
an active research area. For example  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
73  | 
<a>this paper</a>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
74  | 
about regular expression matching and partial derivatives was presented this summer at the international  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
75  | 
PPDP'12 conference. The task in this project is to implement the results from this paper.</p>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
76  | 
|
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
77  | 
<p>The background for this project is that some regular expressions are  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
78  | 
<a>evil</a>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
79  | 
and can stab you in the back; according to  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
80  | 
this <a>blog post</a>.  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
81  | 
For example, if you use in <a>Python</a> or  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
82  | 
in <a>Ruby</a> (probably also in other mainstream programming languages) the  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
83  | 
  innocently looking regular expression a?{28}a{28} and match it, say, against the string 
 | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
84  | 
<red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
85  | 
Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
86  | 
<a>re.py</a> (Python version) and  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
87  | 
<a>re.rb</a>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
88  | 
(Ruby version). You can imagine an attacker  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
89  | 
mounting a nice <a>DoS attack</a> against  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
90  | 
your program if it contains such an evil regular expression. Actually  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
91  | 
<a>Scala</a> (and also Java) are almost immune from such  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
92  | 
attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
93  | 
the regular expression and string further to, say, 4,600 as, then you get a  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
94  | 
StackOverflowError  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
95  | 
potentially crashing your program.  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
96  | 
</p>  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
97  | 
"""  | 
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
98  | 
|
| 
 
7717f20f0504
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
66 
diff
changeset
 | 
99  | 
interpret(T.fromString(test_string), 0, Nil)  |