regexp3.scala
changeset 29 774007c4b1b3
parent 28 f63ba92a7d78
child 39 e5fb17c02508
// regular expressions including NOT
abstract class Rexp

case object NULL extends Rexp
case object EMPTY extends Rexp
case class CHAR(c: Char) extends Rexp
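case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp
case class NOT(r: Rexp) extends Rexp
// (the four constructors above and the string coercion below are unchanged
// context elided by the changeset view; the standard definitions are
// assumed here so that the script is self-contained)

// for typing in regular expressions as plain strings
def charlist2rexp(s : List[Char]) : Rexp = s match {
  case Nil => EMPTY
  case c::Nil => CHAR(c)
  case c::s => SEQ(CHAR(c), charlist2rexp(s))
}
implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)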

// nullable: tests whether a regular expression
// can recognise the empty string
def nullable (r: Rexp) : Boolean = r match {
  case NULL => false
  case EMPTY => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
  case NOT(r) => !(nullable(r))
}
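
// Illustrative checks (not in the original file):
println(nullable(EMPTY))                        // true
println(nullable(CHAR('a')))                    // false
println(nullable(SEQ(STAR(CHAR('a')), EMPTY)))  // true
println(nullable(NOT(CHAR('a'))))               // true: NOT flips nullability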

// tests whether a regular expression
// cannot recognise anything any more
// (used by munch below to stop reading input early)
def no_more (r: Rexp) : Boolean = r match {
  case NULL => true
  case EMPTY => false
  case CHAR(_) => false
  case ALT(r1, r2) => no_more(r1) && no_more(r2)
  case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1)
  case STAR(_) => false
  case NOT(r) => !(no_more(r))
}
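
// Illustrative checks (not in the original file):
println(no_more(NULL))                  // true
println(no_more(SEQ(NULL, CHAR('a')))) // true: the first component is already dead
println(no_more(ALT(NULL, CHAR('a')))) // false: the right alternative can still match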


// derivative of a regular expression w.r.t. a character
def der (c: Char, r: Rexp) : Rexp = r match {
  // the first five cases are unchanged context elided in the changeset;
  // the standard Brzozowski clauses are assumed here
  case NULL => NULL
  case EMPTY => NULL
  case CHAR(d) => if (c == d) EMPTY else NULL
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) => 
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r) => SEQ(der(c, r), STAR(r))
  case NOT(r) => NOT(der(c, r))
}

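// Example (illustrative): deriving a* by 'a' yields EMPTY . a*,
// which still matches any number of further 'a's
println(nullable(der('a', STAR(CHAR('a')))))  // true
println(nullable(der('b', STAR(CHAR('a')))))  // false
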
// derivative w.r.t. a string (iterates der)
def ders (s: List[Char], r: Rexp) : Rexp = s match {
  case Nil => r
  case c::s => ders(s, der(c, r))
}

// main matcher function
def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
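
// Illustrative checks (not in the original file):
println(matcher(SEQ(CHAR('a'), CHAR('b')), "ab"))       // true
println(matcher(NOT(SEQ(CHAR('a'), CHAR('b'))), "ab"))  // false
println(matcher(STAR(CHAR('a')), ""))                   // true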
       
// regular expression for specifying
// ranges of characters
def RANGE(s : List[Char]) : Rexp = s match {
  case Nil => NULL
  case c::Nil => CHAR(c)
  case c::s => ALT(CHAR(c), RANGE(s))
}

// one or more
def PLUS(r: Rexp) = SEQ(r, STAR(r))
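
// Example (illustrative): one or more characters from {a, b}
println(matcher(PLUS(RANGE("ab".toList)), "abba"))  // true
println(matcher(PLUS(RANGE("ab".toList)), ""))      // false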

// some regular expressions
val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
val LETTER = ALT(LOWERCASE, UPPERCASE)
val DIGIT = RANGE("0123456789".toList)
val NONZERODIGIT = RANGE("123456789".toList)

val IDENT = SEQ(LETTER, STAR(ALT(LETTER, DIGIT)))
val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
val WHITESPACE = RANGE(" \n".toList)
val WHITESPACES = PLUS(WHITESPACE)

val ALL = ALT(ALT(LETTER, DIGIT), WHITESPACE)

val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
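
// The NOT part rules out a premature "*/" inside the comment body.
// Illustrative checks (not in the original file):
println(matcher(COMMENT, "/* if true then */"))  // true
println(matcher(COMMENT, "/* unclosed"))         // false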


// for classifying the strings that have been recognised
abstract class Token

case object T_WHITESPACE extends Token
case object T_COMMENT extends Token
case class T_IDENT(s: String) extends Token
case class T_OP(s: String) extends Token
case class T_NUM(n: Int) extends Token
case class T_KEYWORD(s: String) extends Token


// an example list of syntactic rules
type Rule = (Rexp, List[Char] => Token)

val rules: List[Rule] =
  List(("if", (s) => T_KEYWORD(s.mkString)),
       ("then", (s) => T_KEYWORD(s.mkString)),
       ("else", (s) => T_KEYWORD(s.mkString)),
       ("+", (s) => T_OP(s.mkString)),   // unchanged context elided in the changeset
       (IDENT, (s) => T_IDENT(s.mkString)),
       (NUMBER, (s) => T_NUM(s.mkString.toInt)),
       (WHITESPACES, (s) => T_WHITESPACE),
       (COMMENT, (s) => T_COMMENT))


def error (s: String) = throw new IllegalArgumentException ("Cannot tokenize: " + s)

// greedily munches characters from s as long as r can still match,
// accumulating the characters read so far in t
def munch(r: Rexp, action: List[Char] => Token, s: List[Char], t: List[Char]) : Option[(List[Char], Token)] = 
  s match {
    // input exhausted and everything read matches r
    case Nil if (nullable(r)) => Some(Nil, action(t))
    case Nil => None
    // the next character would kill r, but the prefix read so far matches
    case c::s if (no_more(der(c, r)) && nullable(r)) => Some(c::s, action(t))
    // the next character would kill r and the prefix does not match
    case c::s if (no_more(der(c, r))) => None
    case c::s => munch(der(c, r), action, s, t ::: List(c))
  }
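
// Illustrative call (not in the original file): munching the identifier
// at the front of the input with the IDENT regular expression returns
// the rest " rest" together with the token T_IDENT("if34")
println(munch(IDENT, (s) => T_IDENT(s.mkString), "if34 rest".toList, Nil))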

// runs all rules on the input and returns the result of the rule
// that munches the most characters, i.e. leaves the shortest rest
def one_token (rs: List[Rule], s: List[Char]) : (List[Char], Token) = {
  val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
  if (somes == Nil) error(s.mkString) else somes.sortBy(_._1.length).head
}

// tokenizes the whole input by repeatedly calling one_token
def tokenize (rs: List[Rule], s: List[Char]) : List[Token] = s match {
  case Nil => Nil
  case _ => one_token(rs, s) match {
    case (rest, token) => token :: tokenize(rs, rest)
  }
}

// examples
println(tokenize(rules, "if true then then 42 else +".toList))
println(tokenize(rules, "if+true+then+then+42+else +".toList))
println(tokenize(rules, "ifff if     34 34".toList))
println(tokenize(rules, "/*ifff if */ hhjj /*34 */".toList))
println(tokenize(rules, "/* if true then */ then 42 else +".toList))
//println(tokenize(rules, "ifff $ if 34".toList)) // causes an error because of the symbol $