progs/html.scala
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Mon, 20 Jul 2020 16:46:30 +0100
changeset 736 d3e477fe6c66
parent 93 4794759139ea
permissions -rw-r--r--
updated
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
66
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     1
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     2
//:load matcher.scala
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     3
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     4
// some regular expressions
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     5
val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     6
val WORD = PLUS(SYM)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     7
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     8
val BTAG = SEQS("<", WORD, ">") 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     9
val ETAG = SEQS("</", WORD, ">")
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    10
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    11
val WHITESPACE = PLUS(RANGE(" \n"))
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    12
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    13
// for classifying the strings that have been recognised
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    14
abstract class Token
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    15
case object T_WHITESPACE extends Token
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    16
case class T_WORD(s: String) extends Token
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    17
case class T_ETAG(s: String) extends Token
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    18
case class T_BTAG(s: String) extends Token
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    19
case class T_NT(s: String, rhs: List[Token]) extends Token
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    20
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    21
val lexing_rules: List[Rule[Token]] = 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    22
  List((BTAG, (s) => T_BTAG(s.mkString)),
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    23
       (ETAG, (s) => T_ETAG(s.mkString)),
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    24
       (WORD, (s) => T_WORD(s.mkString)),
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    25
       (WHITESPACE, (s) => T_WHITESPACE))
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    26
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    27
// the tokenizer
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    28
val T = Tokenizer(lexing_rules)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    29
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    30
// width for printing
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    31
val WIDTH = 60
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    32
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    33
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    34
def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit= ts match {
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    35
  case Nil => println(Console.RESET)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    36
  case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    37
  case T_WORD(s)::rest => {
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    38
    val newstr = Console.RESET + ctr.reverse.mkString + s
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    39
    if (c + s.length < WIDTH) {
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    40
      print(newstr);
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    41
      interpret(rest, c + s.length, ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    42
    }
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    43
    else {
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    44
      print("\n" + newstr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    45
      interpret(rest, s.length, ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    46
    } 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    47
  }
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    48
  case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    49
  case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    50
  case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    51
  case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    52
  case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    53
  case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    54
  case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    55
  case T_ETAG(_)::rest => interpret(rest, c, ctr.tail)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    56
  case _::rest => interpret(rest, c, ctr)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    57
}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    58
 
71
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    59
val test_string = """
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    60
<b>MSc Projects</b>
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    61
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    62
<p>
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    63
start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    64
</p> 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    65
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    66
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    67
 <p><b>Description:</b>  
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    68
  <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    69
  lexing programs, syntax highlighting and so on. Given that regular expressions were
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    70
  introduced in 1950 by <a>Stephen Kleene</a>, you might think 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    71
  regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    72
  an active research area. For example
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    73
  <a>this paper</a> 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    74
  about regular expression matching and partial derivatives was presented this summer at the international 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    75
  PPDP'12 conference. The task in this project is to implement the results from this paper.</p>
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    76
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    77
  <p>The background for this project is that some regular expressions are 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    78
  <a>evil</a>
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    79
  and can stab you in the back; according to
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    80
  this <a>blog post</a>.
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    81
  For example, if you use in <a>Python</a> or 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    82
  in <a>Ruby</a> (probably also in other mainstream programming languages) the 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    83
  innocently looking regular expression a?{28}a{28} and match it, say, against the string 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    84
  <red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    85
  Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    86
  <a>re.py</a> (Python version) and 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    87
  <a>re.rb</a> 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    88
  (Ruby version). You can imagine an attacker
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    89
  mounting a nice <a>DoS attack</a> against 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    90
  your program if it contains such an evil regular expression. Actually 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    91
  <a>Scala</a> (and also Java) are almost immune from such
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    92
  attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    93
  the regular expression and string further to, say, 4,600 as, then you get a 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    94
  StackOverflowError 
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    95
  potentially crashing your program.
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    96
  </p>
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    97
"""
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    98
7717f20f0504 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 66
diff changeset
    99
interpret(T.fromString(test_string), 0, Nil)