progs/regexp5.scala
changeset 93 4794759139ea
parent 92 e85600529ca5
equal deleted inserted replaced
92:e85600529ca5 93:4794759139ea
       
     1 // regular expressions
       
     2 abstract class Rexp
       
     3 
       
     4 case object NULL extends Rexp
       
     5 case object EMPTY extends Rexp
       
     6 case class CHAR(c: Char) extends Rexp
       
     7 case class ALT(r1: Rexp, r2: Rexp) extends Rexp
       
     8 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
       
     9 case class STAR(r: Rexp) extends Rexp
       
    10 case class NOT(r: Rexp) extends Rexp
       
    11 
       
    12 
       
    13 // some convenience for typing in regular expressions
       
    14 def charlist2rexp(s : List[Char]) : Rexp = s match {
       
    15   case Nil => EMPTY
       
    16   case c::Nil => CHAR(c)
       
    17   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    18 }
       
    19 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
       
    20 
       
    21 
       
    22 // nullable function: tests whether the regular 
       
    23 // expression can recognise the empty string
       
    24 def nullable (r: Rexp) : Boolean = r match {
       
    25   case NULL => false
       
    26   case EMPTY => true
       
    27   case CHAR(_) => false
       
    28   case ALT(r1, r2) => nullable(r1) || nullable(r2)
       
    29   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
       
    30   case STAR(_) => true
       
    31   case NOT(r) => !(nullable(r))
       
    32 }
       
    33 
       
    34 // tests whether a regular expression 
       
    35 // recognises nothing
       
    36 def zeroable (r: Rexp) : Boolean = r match {
       
    37   case NULL => true
       
    38   case EMPTY => false
       
    39   case CHAR(_) => false
       
    40   case ALT(r1, r2) => zeroable(r1) && zeroable(r2)
       
    41   case SEQ(r1, r2) => if (nullable(r1)) (zeroable(r1) && zeroable(r2)) else zeroable(r1)
       
    42       //zeroable(r1) || zeroable(r2)
       
    43   case STAR(_) => false
       
    44   case NOT(r) => !(zeroable(r))
       
    45 }
       
    46 
       
    47 
       
    48 // derivative of a regular expression w.r.t. a character
       
    49 def der (c: Char, r: Rexp) : Rexp = r match {
       
    50   case NULL => NULL
       
    51   case EMPTY => NULL
       
    52   case CHAR(d) => if (c == d) EMPTY else NULL
       
    53   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
       
    54   case SEQ(r1, r2) => 
       
    55     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
       
    56     else SEQ(der(c, r1), r2)
       
    57   case STAR(r) => SEQ(der(c, r), STAR(r))
       
    58   case NOT(r) => NOT(der (c, r))
       
    59 }
       
    60 
       
    61 // derivative w.r.t. a string (iterates der)
       
    62 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
    63   case Nil => r
       
    64   case c::s => ders(s, der(c, r))
       
    65 }
       
    66 
       
    67 // main matcher function
       
    68 def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
       
    69 
       
    70 
       
    71 // regular expression for specifying 
       
    72 // ranges of characters
       
    73 def RANGE(s : List[Char]) : Rexp = s match {
       
    74   case Nil => NULL
       
    75   case c::Nil => CHAR(c)
       
    76   case c::s => ALT(CHAR(c), RANGE(s))
       
    77 }
       
    78 
       
    79 //one or more
       
    80 def PLUS(r: Rexp) = SEQ(r, STAR(r))
       
    81 
       
    82 
       
    83 //some regular expressions
       
    84 val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
       
    85 val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
       
    86 val LETTER = ALT(LOWERCASE, UPPERCASE)
       
    87 val DIGITS = RANGE("0123456789".toList)
       
    88 val NONZERODIGITS = RANGE("123456789".toList)
       
    89 
       
    90 val IDENT = SEQ(LETTER, STAR(ALT(LETTER,DIGITS)))
       
    91 val NUMBER = ALT(SEQ(NONZERODIGITS, STAR(DIGITS)), "0")
       
    92 val WHITESPACE = RANGE(" \n".toList)
       
    93 
       
    94 val ALL = ALT(ALT(LETTER, DIGITS), WHITESPACE)
       
    95 
       
    96 val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
       
    97 
       
    98 println(matcher(NUMBER, "0"))
       
    99 println(matcher(NUMBER, "01"))
       
   100 println(matcher(NUMBER, "123450"))
       
   101 
       
   102 println(matcher(SEQ(STAR("a"), STAR("b")), "bbaaa"))
       
   103 println(matcher(ALT(STAR("a"), STAR("b")), ""))
       
   104 println(matcher("abc", ""))
       
   105 println(matcher(STAR(ALT(EMPTY, "a")), ""))
       
   106 println(matcher(STAR(EMPTY), "a"))
       
   107 println(matcher("cab","cab"))
       
   108 println(matcher(STAR("a"),"aaa"))
       
   109 println(matcher("cab" ,"cab"))
       
   110 println(matcher(STAR("a"),"aaa"))
       
   111 
       
   112 println(matcher(COMMENT, "/* */"))
       
   113 println(matcher(COMMENT, "/* 34 */"))
       
   114 println(matcher(COMMENT, "/* foobar comment */"))
       
   115 println(matcher(COMMENT, "/* test */ test */"))
       
   116 
       
   117 // an example list of regular expressions
       
   118 
       
   119 abstract class Token
       
   120 
       
   121 case object T_WHITESPACE extends Token
       
   122 case object T_COMMENT extends Token
       
   123 case class T_IDENT(s: String) extends Token
       
   124 case class T_OP(s: String) extends Token
       
   125 case class T_NUM(n: Int) extends Token
       
   126 case class T_KEYWORD(s: String) extends Token
       
   127 
       
   128 val regs: List[Rexp]=  List("if", "then", "else", "+", IDENT, NUMBER, WHITESPACE) 
       
   129 
       
   130 type Rule = (Rexp, List[Char] => Token)
       
   131 
       
   132 val rules: List[Rule]= 
       
   133   List(("if", (s) => T_KEYWORD(s.mkString)),
       
   134        ("then", (s) => T_KEYWORD(s.mkString)),
       
   135        ("else", (s) => T_KEYWORD(s.mkString)),
       
   136        ("+", (s) => T_OP(s.mkString)),
       
   137        (IDENT, (s) => T_IDENT(s.mkString)),
       
   138        (NUMBER, (s) => T_NUM(s.mkString.toInt)),
       
   139        (WHITESPACE, (s) => T_WHITESPACE),
       
   140        (COMMENT, (s) => T_COMMENT)) 
       
   141 
       
   142 
       
   143 def error (s: String) = throw new IllegalArgumentException ("Could not lex " + s)
       
   144 
       
   145 def munch(r: Rexp, action: List[Char] => Token, s: List[Char], t: List[Char]) : Option[(List[Char], Token)] = 
       
   146 { println("string " + s)
       
   147   println("  rexp " + r)
       
   148   s match {
       
   149     case Nil if (nullable(r)) => Some(Nil, action(t))
       
   150     case Nil => { println("1"); None }
       
   151     case c::s if (zeroable(der (c, r)) && nullable(r)) => Some(c::s, action(t))
       
   152     case c::s if (zeroable(der (c, r))) => { println("2"); None }
       
   153     case c::s => munch(der (c, r), action, s, t ::: List(c))
       
   154   }
       
   155 }
       
   156 
       
   157 def lex_one (rs: List[Rule], s: List[Char]) : (List[Char], Token) = {
       
   158  val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) } .flatten
       
   159  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
       
   160 }
       
   161 
       
   162 def lex_all (rs: List[Rule], s: List[Char]) : List[Token] = s match {
       
   163   case Nil => Nil
       
   164   case _ => lex_one(rs, s) match {
       
   165     case (rest, t) => t :: lex_all(rs, rest) 
       
   166   }
       
   167 }
       
   168 
       
   169 
       
   170 
       
   171 println(matcher(COMMENT, "/*ifff if     34 34*/"))
       
   172 rules.map { (r) => munch(r._1, r._2, "/*ifff if     34 34*/  ".toList, Nil) }
       
   173 println(lex_all(rules, "ifff if     34 34".toList))
       
   174 println(lex_all(rules, "  /*ifff if     34 34*/  ".toList))
       
   175 println(lex_all(rules, "ifff $ if 34".toList))
       
   176 
       
   177