regexp2.scala
changeset 29 774007c4b1b3
parent 28 f63ba92a7d78
equal deleted inserted replaced
28:f63ba92a7d78 29:774007c4b1b3
     1 // regular expressions
     1 
       
     2 // regular expressions including NOT
     2 abstract class Rexp
     3 abstract class Rexp
     3 
     4 
     4 case object NULL extends Rexp
     5 case object NULL extends Rexp
     5 case object EMPTY extends Rexp
     6 case object EMPTY extends Rexp
     6 case class CHAR(c: Char) extends Rexp
     7 case class CHAR(c: Char) extends Rexp
    30   case STAR(_) => true
    31   case STAR(_) => true
    31   case NOT(r) => !(nullable(r))
    32   case NOT(r) => !(nullable(r))
    32 }
    33 }
    33 
    34 
    34 // tests whether a regular expression 
    35 // tests whether a regular expression 
    35 // recognises nothing
    36 // cannot recognise more
    36 def zeroable (r: Rexp) : Boolean = r match {
    37 def no_more (r: Rexp) : Boolean = r match {
    37   case NULL => true
    38   case NULL => true
    38   case EMPTY => false
    39   case EMPTY => false
    39   case CHAR(_) => false
    40   case CHAR(_) => false
    40   case ALT(r1, r2) => zeroable(r1) && zeroable(r2)
    41   case ALT(r1, r2) => no_more(r1) && no_more(r2)
    41   case SEQ(r1, r2) => if (nullable(r1)) (zeroable(r1) && zeroable(r2)) else zeroable(r1)
    42   case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1)
    42   case STAR(_) => false
    43   case STAR(_) => false
    43   case NOT(r) => !(zeroable(r))
    44   case NOT(r) => !(no_more(r))
    44 }
    45 }
    45 
    46 
    46 
    47 
    47 // derivative of a regular expression w.r.t. a character
    48 // derivative of a regular expression w.r.t. a character
    48 def der (c: Char, r: Rexp) : Rexp = r match {
    49 def der (c: Char, r: Rexp) : Rexp = r match {
    54     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    55     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    55     else SEQ(der(c, r1), r2)
    56     else SEQ(der(c, r1), r2)
    56   case STAR(r) => SEQ(der(c, r), STAR(r))
    57   case STAR(r) => SEQ(der(c, r), STAR(r))
    57   case NOT(r) => NOT(der (c, r))
    58   case NOT(r) => NOT(der (c, r))
    58 }
    59 }
    59 
       
    60 // derivative w.r.t. a string (iterates der)
       
    61 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
    62   case Nil => r
       
    63   case c::s => ders(s, der(c, r))
       
    64 }
       
    65 
       
    66 // main matcher function
       
    67 def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
       
    68 
    60 
    69 
    61 
    70 // regular expression for specifying 
    62 // regular expression for specifying 
    71 // ranges of characters
    63 // ranges of characters
    72 def RANGE(s : List[Char]) : Rexp = s match {
    64 def RANGE(s : List[Char]) : Rexp = s match {
    81 
    73 
    82 //some regular expressions
    74 //some regular expressions
    83 val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
    75 val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
    84 val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
    76 val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
    85 val LETTER = ALT(LOWERCASE, UPPERCASE)
    77 val LETTER = ALT(LOWERCASE, UPPERCASE)
    86 val DIGITS = RANGE("0123456789".toList)
    78 val DIGIT = RANGE("0123456789".toList)
    87 val NONZERODIGITS = RANGE("123456789".toList)
    79 val NONZERODIGIT = RANGE("123456789".toList)
    88 
    80 
    89 val IDENT = SEQ(LETTER, STAR(ALT(LETTER,DIGITS)))
    81 val IDENT = SEQ(LETTER, STAR(ALT(LETTER,DIGIT)))
    90 val NUMBER = ALT(SEQ(NONZERODIGITS, STAR(DIGITS)), "0")
    82 val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
    91 val WHITESPACE = RANGE(" \n".toList)
    83 val WHITESPACE = RANGE(" \n".toList)
    92 val WHITESPACES = PLUS(WHITESPACE)
    84 val WHITESPACES = PLUS(WHITESPACE)
    93 
    85 
    94 val ALL = ALT(ALT(LETTER, DIGITS), WHITESPACE)
    86 val ALL = ALT(ALT(LETTER, DIGIT), WHITESPACE)
    95 
       
    96 val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
    87 val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
    97 
    88 
    98 println(matcher(NUMBER, "0"))
       
    99 println(matcher(NUMBER, "01"))
       
   100 println(matcher(NUMBER, "123450"))
       
   101 
       
   102 println(matcher(SEQ(STAR("a"), STAR("b")), "bbaaa"))
       
   103 println(matcher(ALT(STAR("a"), STAR("b")), ""))
       
   104 println(matcher("abc", ""))
       
   105 println(matcher(STAR(ALT(EMPTY, "a")), ""))
       
   106 println(matcher(STAR(EMPTY), "a"))
       
   107 println(matcher("cab","cab"))
       
   108 println(matcher(STAR("a"),"aaa"))
       
   109 println(matcher("cab" ,"cab"))
       
   110 println(matcher(STAR("a"),"aaa"))
       
   111 
       
   112 println(matcher(COMMENT, "/* */"))
       
   113 println(matcher(COMMENT, "/* foobar comment */"))
       
   114 println(matcher(COMMENT, "/* test */ test */"))
       
   115 
    89 
   116 // an example list of regular expressions
    90 // an example list of regular expressions
   117 val regs: List[Rexp]=  List("if", "then", "else", "+", IDENT, NUMBER, WHITESPACES) 
    91 val regs: List[Rexp]=  List("if", "then", "else", "+", IDENT, NUMBER, WHITESPACES, COMMENT) 
       
    92 
   118 
    93 
   119 def error (s: String) = throw new IllegalArgumentException ("Could not lex " + s)
    94 def error (s: String) = throw new IllegalArgumentException ("Could not lex " + s)
   120 
    95 
   121 def munch(r: Rexp, s: List[Char], t: List[Char]) : Option[(List[Char], List[Char])] = s match {
    96 def munch(r: Rexp, s: List[Char], t: List[Char]) : Option[(List[Char], List[Char])] = 
   122   case Nil if (nullable(r)) => Some(Nil, t)
    97   s match {
   123   case Nil => None
    98     case Nil if (nullable(r)) => Some(Nil, t)
   124   case c::s if (zeroable(der (c, r)) && nullable(r)) => Some(c::s, t)
    99     case Nil => None
   125   case c::s if (zeroable(der (c, r))) => None
   100     case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, t)
   126   case c::s => munch(der (c, r), s, t ::: List(c))
   101     case c::s if (no_more(der (c, r))) => None
   127 }
   102     case c::s => munch(der (c, r), s, t ::: List(c))
       
   103   }
   128 
   104 
   129 def lex_one (regs: List[Rexp], s: List[Char]) : (List[Char], List[Char]) = {
   105 def one_string (regs: List[Rexp], s: List[Char]) : (List[Char], List[Char]) = {
   130  val somes = regs.map { munch(_, s, Nil) } .flatten
   106  val somes = regs.map { munch(_, s, Nil) } .flatten
   131  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
   107  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
   132 }
   108 }
   133 
   109 
   134 def lex_all (regs: List[Rexp], s: List[Char]) : List[String] = s match {
   110 def tokenize (regs: List[Rexp], s: List[Char]) : List[String] = s match {
   135   case Nil => Nil
   111   case Nil => Nil
   136   case _ => lex_one(regs, s) match {
   112   case _ => one_string(regs, s) match {
   137     case (rest, s) => s.mkString :: lex_all(regs, rest) 
   113     case (rest, s) => s.mkString :: tokenize(regs, rest) 
   138   }
   114   }
   139 }
   115 }
   140 
   116 
   141 val regs: List[Rexp]=  List("if", "then", "else", "+", IDENT, NUMBER, WHITESPACES) 
   117 //examples
   142 
   118 println(tokenize(regs, "if true then then 42 else +".toList))
   143 println(lex_all(regs, "if true then 42 else +".toList))
   119 println(tokenize(regs, "if+true+then+then+42+else +".toList))
   144 println(lex_all(regs, "ifff if     34 34".toList))
   120 println(tokenize(regs, "ifff if     34 34".toList))
   145 println(lex_all(regs, "ifff +if+     34 34".toList))
   121 println(tokenize(regs, "/*ifff if */ hhjj /*34 */".toList))
   146 println(lex_all(regs, "1+x+3+4+foo".toList))
   122 println(tokenize(regs, "/* if true then */ then 42 else +".toList))
   147 println(lex_all(regs, "ifff $ if 34".toList))
   123 //println(tokenize(regs, "ifff $ if 34".toList)) // causes an error because of the symbol $
   148