regexp3.scala
changeset 26 06be91bbb1cd
child 28 f63ba92a7d78
equal deleted inserted replaced
25:94133a0e62ed 26:06be91bbb1cd
       
     1 // regular expressions
       
     2 abstract class Rexp
       
     3 
       
     4 case object NULL extends Rexp
       
     5 case object EMPTY extends Rexp
       
     6 case class CHAR(c: Char) extends Rexp
       
     7 case class ALT(r1: Rexp, r2: Rexp) extends Rexp
       
     8 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
       
     9 case class STAR(r: Rexp) extends Rexp
       
    10 case class NOT(r: Rexp) extends Rexp
       
    11 
       
    12 
       
    13 // some convenience for typing in regular expressions
       
    14 def charlist2rexp(s : List[Char]) : Rexp = s match {
       
    15   case Nil => EMPTY
       
    16   case c::Nil => CHAR(c)
       
    17   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    18 }
       
    19 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
       
    20 
       
    21 
       
    22 // nullable function: tests whether the regular 
       
    23 // expression can recognise the empty string
       
    24 def nullable (r: Rexp) : Boolean = r match {
       
    25   case NULL => false
       
    26   case EMPTY => true
       
    27   case CHAR(_) => false
       
    28   case ALT(r1, r2) => nullable(r1) || nullable(r2)
       
    29   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
       
    30   case STAR(_) => true
       
    31   case NOT(r) => !(nullable(r))
       
    32 }
       
    33 
       
    34 // tests whether a regular expression 
       
    35 // recognises nothing
       
    36 def zeroable (r: Rexp) : Boolean = r match {
       
    37   case NULL => true
       
    38   case EMPTY => false
       
    39   case CHAR(_) => false
       
    40   case ALT(r1, r2) => zeroable(r1) && zeroable(r2)
       
    41   case SEQ(r1, r2) => if (nullable(r1)) (zeroable(r1) && zeroable(r2)) else zeroable(r1)
       
    42       //zeroable(r1) || zeroable(r2)
       
    43   case STAR(_) => false
       
    44   case NOT(r) => !(zeroable(r))
       
    45 }
       
    46 
       
    47 
       
    48 // derivative of a regular expression w.r.t. a character
       
    49 def der (c: Char, r: Rexp) : Rexp = r match {
       
    50   case NULL => NULL
       
    51   case EMPTY => NULL
       
    52   case CHAR(d) => if (c == d) EMPTY else NULL
       
    53   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
       
    54   case SEQ(r1, r2) => 
       
    55     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
       
    56     else SEQ(der(c, r1), r2)
       
    57   case STAR(r) => SEQ(der(c, r), STAR(r))
       
    58   case NOT(r) => NOT(der (c, r))
       
    59 }
       
    60 
       
    61 // derivative w.r.t. a string (iterates der)
       
    62 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
    63   case Nil => r
       
    64   case c::s => ders(s, der(c, r))
       
    65 }
       
    66 
       
    67 // main matcher function
       
    68 def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
       
    69 
       
    70 
       
    71 // regular expression for specifying 
       
    72 // ranges of characters
       
    73 def RANGE(s : List[Char]) : Rexp = s match {
       
    74   case Nil => NULL
       
    75   case c::Nil => CHAR(c)
       
    76   case c::s => ALT(CHAR(c), RANGE(s))
       
    77 }
       
    78 
       
    79 //one or more
       
    80 def PLUS(r: Rexp) = SEQ(r, STAR(r))
       
    81 
       
    82 
       
    83 //some regular expressions
       
    84 val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
       
    85 val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
       
    86 val LETTER = ALT(LOWERCASE, UPPERCASE)
       
    87 val DIGITS = RANGE("0123456789".toList)
       
    88 val NONZERODIGITS = RANGE("123456789".toList)
       
    89 
       
    90 val IDENT = SEQ(LETTER, STAR(ALT(LETTER,DIGITS)))
       
    91 val NUMBER = ALT(SEQ(NONZERODIGITS, STAR(DIGITS)), "0")
       
    92 val WHITESPACE = RANGE(" \n".toList)
       
    93 
       
    94 val ALL = ALT(ALT(LETTER, DIGITS), WHITESPACE)
       
    95 
       
    96 val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
       
    97 
       
    98 abstract class Token
       
    99 
       
   100 case object T_WHITESPACE extends Token
       
   101 case class T_IDENT(s: String) extends Token
       
   102 case class T_OP(s: String) extends Token
       
   103 case class T_NUM(n: Int) extends Token
       
   104 case class T_KEYWORD(s: String) extends Token
       
   105 
       
   106 
       
   107 // an example list of rules
       
   108 type Rule = (Rexp, List[Char] => Token)
       
   109 
       
   110 val rules: List[Rule]= 
       
   111   List(("if", (s) => T_KEYWORD(s.mkString)),
       
   112        ("then", (s) => T_KEYWORD(s.mkString)),
       
   113        ("else", (s) => T_KEYWORD(s.mkString)),
       
   114        ("+", (s) => T_OP(s.mkString)),
       
   115        (IDENT, (s) => T_IDENT(s.mkString)),
       
   116        (NUMBER, (s) => T_NUM(s.mkString.toInt)),
       
   117        (WHITESPACE, (s) => T_WHITESPACE))
       
   118 
       
   119 
       
   120 def error (s: String) = throw new IllegalArgumentException ("Could not lex " + s)
       
   121 
       
   122 def munch(r: Rexp, action: List[Char] => Token, s: List[Char], t: List[Char]) : Option[(List[Char], Token)] = 
       
   123   s match {
       
   124     case Nil if (nullable(r)) => Some(Nil, action(t))
       
   125     case Nil => None
       
   126     case c::s if (zeroable(der (c, r)) && nullable(r)) => Some(c::s, action(t))
       
   127     case c::s if (zeroable(der (c, r))) => None
       
   128     case c::s => munch(der (c, r), action, s, t ::: List(c))
       
   129   }
       
   130 
       
   131 def lex_one (rs: List[Rule], s: List[Char]) : (List[Char], Token) = {
       
   132  val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) } .flatten
       
   133  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
       
   134 }
       
   135 
       
   136 def lex_all (rs: List[Rule], s: List[Char]) : List[Token] = s match {
       
   137   case Nil => Nil
       
   138   case _ => lex_one(rs, s) match {
       
   139     case (rest, t) => t :: lex_all(rs, rest) 
       
   140   }
       
   141 }
       
   142 
       
   143 
       
   144 
       
   145 println(lex_all(rules, "if true then 42 else +".toList))
       
   146 println(lex_all(rules, "ifff if     34 34".toList))
       
   147 println(lex_all(rules, "ifff $ if 34".toList))
       
   148 
       
   149