progs/html.scala
changeset 93 4794759139ea
parent 92 e85600529ca5
equal deleted inserted replaced
92:e85600529ca5 93:4794759139ea
       
     1 
       
     2 //:load matcher.scala
       
     3 
       
     4 // some regular expressions
       
     5 val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")
       
     6 val WORD = PLUS(SYM)
       
     7 
       
     8 val BTAG = SEQS("<", WORD, ">") 
       
     9 val ETAG = SEQS("</", WORD, ">")
       
    10 
       
    11 val WHITESPACE = PLUS(RANGE(" \n"))
       
    12 
       
    13 // for classifying the strings that have been recognised
       
    14 abstract class Token
       
    15 case object T_WHITESPACE extends Token
       
    16 case class T_WORD(s: String) extends Token
       
    17 case class T_ETAG(s: String) extends Token
       
    18 case class T_BTAG(s: String) extends Token
       
    19 case class T_NT(s: String, rhs: List[Token]) extends Token
       
    20 
       
    21 val lexing_rules: List[Rule[Token]] = 
       
    22   List((BTAG, (s) => T_BTAG(s.mkString)),
       
    23        (ETAG, (s) => T_ETAG(s.mkString)),
       
    24        (WORD, (s) => T_WORD(s.mkString)),
       
    25        (WHITESPACE, (s) => T_WHITESPACE))
       
    26 
       
    27 // the tokenizer
       
    28 val T = Tokenizer(lexing_rules)
       
    29 
       
    30 // width for printing
       
    31 val WIDTH = 60
       
    32 
       
    33 
       
    34 def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit= ts match {
       
    35   case Nil => println(Console.RESET)
       
    36   case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr)
       
    37   case T_WORD(s)::rest => {
       
    38     val newstr = Console.RESET + ctr.reverse.mkString + s
       
    39     if (c + s.length < WIDTH) {
       
    40       print(newstr);
       
    41       interpret(rest, c + s.length, ctr)
       
    42     }
       
    43     else {
       
    44       print("\n" + newstr)
       
    45       interpret(rest, s.length, ctr)
       
    46     } 
       
    47   }
       
    48   case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)
       
    49   case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)
       
    50   case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)
       
    51   case T_BTAG("<a>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)
       
    52   case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)
       
    53   case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)
       
    54   case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)
       
    55   case T_ETAG(_)::rest => interpret(rest, c, ctr.tail)
       
    56   case _::rest => interpret(rest, c, ctr)
       
    57 }
       
    58  
       
    59 val test_string = """
       
    60 <b>MSc Projects</b>
       
    61 
       
    62 <p>
       
    63 start of paragraph. <cyan> a <red>cyan</red> word</cyan> normal again something longer.
       
    64 </p> 
       
    65 
       
    66 
       
    67  <p><b>Description:</b>  
       
    68   <a>Regular expressions</a> are extremely useful for many text-processing tasks such as finding patterns in texts,
       
    69   lexing programs, syntax highlighting and so on. Given that regular expressions were
       
    70   introduced in 1950 by <a>Stephen Kleene</a>, you might think 
       
    71   regular expressions have since been studied and implemented to death. But you would definitely be mistaken: in fact they are still
       
    72   an active research area. For example
       
    73   <a>this paper</a> 
       
    74   about regular expression matching and partial derivatives was presented this summer at the international 
       
    75   PPDP'12 conference. The task in this project is to implement the results from this paper.</p>
       
    76 
       
    77   <p>The background for this project is that some regular expressions are 
       
    78   <a>evil</a>
       
    79   and can stab you in the back; according to
       
    80   this <a>blog post</a>.
       
    81   For example, if you use in <a>Python</a> or 
       
    82   in <a>Ruby</a> (probably also in other mainstream programming languages) the 
       
    83   innocently looking regular expression a?{28}a{28} and match it, say, against the string 
       
    84   <red>aaaaaaaaaa<cyan>aaaaaaa</cyan>aaaaaaaaaaa</red> (that is 28 as), you will soon notice that your CPU usage goes to 100%. In fact,
       
    85   Python and Ruby need approximately 30 seconds of hard work for matching this string. You can try it for yourself:
       
    86   <a>re.py</a> (Python version) and 
       
    87   <a>re.rb</a> 
       
    88   (Ruby version). You can imagine an attacker
       
    89   mounting a nice <a>DoS attack</a> against 
       
    90   your program if it contains such an evil regular expression. Actually 
       
    91   <a>Scala</a> (and also Java) are almost immune from such
       
    92   attacks as they can deal with strings of up to 4,300 as in less than a second. But if you scale
       
    93   the regular expression and string further to, say, 4,600 as, then you get a 
       
    94   StackOverflowError 
       
    95   potentially crashing your program.
       
    96   </p>
       
    97 """
       
    98 
       
    99 interpret(T.fromString(test_string), 0, Nil)