scala/regexp4.scala
changeset 92 e85600529ca5
equal deleted inserted replaced
91:47f86885d481 92:e85600529ca5
       
     1 // regular expressions
       
     2 abstract class Rexp
       
     3 
       
     4 case object NULL extends Rexp
       
     5 case object EMPTY extends Rexp
       
     6 case class CHAR(c: Char) extends Rexp
       
     7 case class ALT(r1: Rexp, r2: Rexp) extends Rexp
       
     8 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
       
     9 case class STAR(r: Rexp) extends Rexp
       
    10 case class NOT(r: Rexp) extends Rexp
       
    11 
       
    12 
       
    13 // some convenience for typing in regular expressions
       
    14 def charlist2rexp(s : List[Char]) : Rexp = s match {
       
    15   case Nil => EMPTY
       
    16   case c::Nil => CHAR(c)
       
    17   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    18 }
       
    19 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
       
    20 
       
    21 
       
    22 // nullable function: tests whether the regular 
       
    23 // expression can recognise the empty string
       
    24 def nullable (r: Rexp) : Boolean = r match {
       
    25   case NULL => false
       
    26   case EMPTY => true
       
    27   case CHAR(_) => false
       
    28   case ALT(r1, r2) => nullable(r1) || nullable(r2)
       
    29   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
       
    30   case STAR(_) => true
       
    31   case NOT(r) => !(nullable(r))
       
    32 }
       
    33 
       
    34 // tests whether a regular expression 
       
    35 // recognises nothing
       
    36 def zeroable (r: Rexp) : Boolean = r match {
       
    37   case NULL => true
       
    38   case EMPTY => false
       
    39   case CHAR(_) => false
       
    40   case ALT(r1, r2) => zeroable(r1) && zeroable(r2)
       
    41   case SEQ(r1, r2) => zeroable(r1) || zeroable(r2)
       
    42   case STAR(_) => false
       
    43   case NOT(r) => !(zeroable(r))
       
    44 }
       
    45 
       
    46 def starts_with (r: Rexp, c: Char) : Boolean = r match {
       
    47   case NULL => false
       
    48   case EMPTY => false
       
    49   case CHAR(d) => (c == d)
       
    50   case ALT(r1, r2) => starts_with(r1, c) || starts_with(r2, c)
       
    51   case SEQ(r1, r2) => if (nullable(r1)) (starts_with(r1, c) || starts_with(r2, c))
       
    52                       else starts_with(r1, c)
       
    53   case STAR(r) => starts_with(r, c)
       
    54   case NOT(r) => !(starts_with(r, c))
       
    55 }
       
    56 
       
    57 // derivative of a regular expression w.r.t. a character
       
    58 def der (c: Char, r: Rexp) : Rexp = r match {
       
    59   case NULL => NULL
       
    60   case EMPTY => NULL
       
    61   case CHAR(d) => if (c == d) EMPTY else NULL
       
    62   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
       
    63   case SEQ(r1, r2) => 
       
    64     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
       
    65     else SEQ(der(c, r1), r2)
       
    66   case STAR(r) => SEQ(der(c, r), STAR(r))
       
    67   case NOT(r) => NOT(der (c, r))
       
    68 }
       
    69 
       
    70 // derivative w.r.t. a string (iterates der)
       
    71 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
    72   case Nil => r
       
    73   case c::s => ders(s, der(c, r))
       
    74 }
       
    75 
       
    76 // main matcher function
       
    77 def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
       
    78 
       
    79 
       
    80 // regular expression for specifying 
       
    81 // ranges of characters
       
    82 def RANGE(s : List[Char]) : Rexp = s match {
       
    83   case Nil => NULL
       
    84   case c::Nil => CHAR(c)
       
    85   case c::s => ALT(CHAR(c), RANGE(s))
       
    86 }
       
    87 
       
    88 //one or more
       
    89 def PLUS(r: Rexp) = SEQ(r, STAR(r))
       
    90 
       
    91 
       
    92 //some regular expressions
       
    93 val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
       
    94 val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
       
    95 val LETTER = ALT(LOWERCASE, UPPERCASE)
       
    96 val DIGITS = RANGE("0123456789".toList)
       
    97 val NONZERODIGITS = RANGE("123456789".toList)
       
    98 
       
    99 val IDENT = SEQ(LETTER, STAR(ALT(LETTER,DIGITS)))
       
   100 val NUMBER = ALT(SEQ(NONZERODIGITS, STAR(DIGITS)), "0")
       
   101 val WHITESPACE = RANGE(" \n".toList)
       
   102 val SYMBOLS = RANGE("/*".toList)
       
   103 
       
   104 val ALL = ALT(ALT(ALT(LETTER, DIGITS), WHITESPACE), SYMBOLS)
       
   105 
       
   106 val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
       
   107 
       
   108 println(matcher(NUMBER, "0"))
       
   109 println(matcher(NUMBER, "01"))
       
   110 println(matcher(NUMBER, "123450"))
       
   111 
       
   112 println(matcher(SEQ(STAR("a"), STAR("b")), "bbaaa"))
       
   113 println(matcher(ALT(STAR("a"), STAR("b")), ""))
       
   114 println(matcher("abc", ""))
       
   115 println(matcher(STAR(ALT(EMPTY, "a")), ""))
       
   116 println(matcher(STAR(EMPTY), "a"))
       
   117 println(matcher("cab","cab"))
       
   118 println(matcher(STAR("a"),"aaa"))
       
   119 println(matcher("cab" ,"cab"))
       
   120 println(matcher(STAR("a"),"aaa"))
       
   121 
       
   122 println(matcher(COMMENT, "/* */"))
       
   123 println(matcher(COMMENT, "/* foobar comment */"))
       
   124 println(matcher(COMMENT, "/* test */ test */"))
       
   125 
       
   126 // an example list of regular expressions
       
   127 val regs: List[Rexp]=  List("if", "then", "else", "+", IDENT, NUMBER, COMMENT, WHITESPACE) 
       
   128 
       
   129 
       
   130 def error (s: String) = throw new IllegalArgumentException ("Could not lex " + s)
       
   131 
       
   132 def munch(r: Rexp, s: List[Char], t: List[Char]) : Option[(List[Char], List[Char])] = 
       
   133   if (zeroable(r)) None else s match {
       
   134   case Nil => if (nullable(r)) Some(Nil, t) else None
       
   135   case c::s if (zeroable(der (c, r)) && nullable(r)) => Some(c::s, t)
       
   136   //case c::s if (zeroable(der (c, r))) => None
       
   137   case c::s => munch(der (c, r), s, t ::: List(c))
       
   138 }
       
   139 
       
   140 
       
   141 def lex_one (regs: List[Rexp], s: List[Char]) : (List[Char], List[Char]) = {
       
   142  val somes = regs.map { munch(_, s, Nil) } .flatten
       
   143  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
       
   144 }
       
   145 
       
   146 def lex_all (regs: List[Rexp], s: List[Char]) : List[String] = s match {
       
   147   case Nil => Nil
       
   148   case _ => lex_one(regs, s) match {
       
   149     case (rest, s) => s.mkString :: lex_all(regs, rest) 
       
   150   }
       
   151 }
       
   152 
       
   153 
       
   154 
       
   155 starts_with(der('/', COMMENT), '*')
       
   156 
       
   157 munch(COMMENT, "/*ifff if 34 */".toList, Nil)
       
   158 val COMMENT2 = NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))
       
   159 
       
   160 der('a', COMMENT2)
       
   161 zeroable(der('a', COMMENT2))
       
   162 
       
   163 matcher(COMMENT2, "ifff if 34")
       
   164 munch(COMMENT2, "ifff if 34".toList, Nil)
       
   165 starts_with(COMMENT2, 'i')
       
   166 lex_all(regs, "ifff if 34".toList)
       
   167 lex_all(regs, "ifff $ if 34".toList)
       
   168