solutions/cw3/lexer.sc
changeset 961 c0600f8b6427
parent 959 64ec1884d860
equal deleted inserted replaced
960:c7009356ddd8 961:c0600f8b6427
     1 // Lexer from CW2
     1 // Lexer from CW2
     2 //================
     2 //================
     3 
     3 
       
     4 //> using toolkit 0.4.0
       
     5 //> using file project.sc
       
     6 import project.*
     4 
     7 
     5 // Rexp
     8 // Rexp
     6 abstract class Rexp
     9 abstract class Rexp
     7 case object ZERO extends Rexp
    10 case object ZERO extends Rexp
     8 case object ONE extends Rexp
    11 case object ONE extends Rexp
     9 case class CHAR(c: Char) extends Rexp
    12 case class CHAR(c: Char) extends Rexp
    10 case class ALT(r1: Rexp, r2: Rexp) extends Rexp 
    13 case class ALT(r1: Rexp, r2: Rexp) extends Rexp
    11 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp 
    14 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
    12 case class STAR(r: Rexp) extends Rexp 
    15 case class STAR(r: Rexp) extends Rexp
    13 case class RECD(x: String, r: Rexp) extends Rexp
    16 case class RECD(x: String, r: Rexp) extends Rexp
    14 
    17 
    15 case class RANGE(s: Set[Char]) extends Rexp
    18 case class RANGE(s: Set[Char]) extends Rexp
    16 case class PLUS(r: Rexp) extends Rexp
    19 case class PLUS(r: Rexp) extends Rexp
    17 case class OPTIONAL(r: Rexp) extends Rexp
    20 case class OPTIONAL(r: Rexp) extends Rexp
    18 case class NTIMES(r: Rexp, n: Int) extends Rexp 
    21 case class NTIMES(r: Rexp, n: Int) extends Rexp
    19 
    22 
    20 // Values
    23 // Values
    21 abstract class Val
    24 abstract class Val
    22 case object Empty extends Val
    25 case object Empty extends Val
    23 case class Chr(c: Char) extends Val
    26 case class Chr(c: Char) extends Val
    33   case Nil => ONE
    36   case Nil => ONE
    34   case c::Nil => CHAR(c)
    37   case c::Nil => CHAR(c)
    35   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    38   case c::s => SEQ(CHAR(c), charlist2rexp(s))
    36 }
    39 }
    37 
    40 
    38 implicit def string2rexp(s : String) : Rexp = 
    41 implicit def string2rexp(s : String) : Rexp =
    39   charlist2rexp(s.toList)
    42   charlist2rexp(s.toList)
    40 
    43 
    41 extension (r: Rexp) {
    44 extension (r: Rexp) {
    42   def | (s: Rexp) = ALT(r, s)
    45   def | (s: Rexp) = ALT(r, s)
    43   def % = STAR(r)
    46   def % = STAR(r)
    48   def | (r: Rexp) = ALT(s, r)
    51   def | (r: Rexp) = ALT(s, r)
    49   def | (r: String) = ALT(s, r)
    52   def | (r: String) = ALT(s, r)
    50   def % = STAR(s)
    53   def % = STAR(s)
    51   def ~ (r: Rexp) = SEQ(s, r)
    54   def ~ (r: Rexp) = SEQ(s, r)
    52   def ~ (r: String) = SEQ(s, r)
    55   def ~ (r: String) = SEQ(s, r)
    53   def $ (r: Rexp) = RECD(s, r)
    56   infix def $ (r: Rexp) = RECD(s, r)
    54 }
    57 }
    55 
    58 
    56 // nullable
    59 // nullable
    57 def nullable(r: Rexp) : Boolean = r match {
    60 def nullable(r: Rexp) : Boolean = r match {
    58   case ZERO => false
    61   case ZERO => false
    73 def der(c: Char, r: Rexp) : Rexp = r match {
    76 def der(c: Char, r: Rexp) : Rexp = r match {
    74   case ZERO => ZERO
    77   case ZERO => ZERO
    75   case ONE => ZERO
    78   case ONE => ZERO
    76   case CHAR(d) => if (c == d) ONE else ZERO
    79   case CHAR(d) => if (c == d) ONE else ZERO
    77   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    80   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    78   case SEQ(r1, r2) => 
    81   case SEQ(r1, r2) =>
    79     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    82     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    80     else SEQ(der(c, r1), r2)
    83     else SEQ(der(c, r1), r2)
    81   case STAR(r) => SEQ(der(c, r), STAR(r))
    84   case STAR(r) => SEQ(der(c, r), STAR(r))
    82 
    85 
    83   case RECD(_, r1) => der(c, r1)
    86   case RECD(_, r1) => der(c, r1)
    84   case RANGE(s) => if (s.contains(c)) ONE else ZERO 
    87   case RANGE(s) => if (s.contains(c)) ONE else ZERO
    85   case PLUS(r1) => SEQ(der(c, r1), STAR(r1))
    88   case PLUS(r1) => SEQ(der(c, r1), STAR(r1))
    86   case OPTIONAL(r1) => der(c, r1)
    89   case OPTIONAL(r1) => der(c, r1)
    87   case NTIMES(r, i) => 
    90   case NTIMES(r, i) =>
    88     if (i == 0) ZERO else SEQ(der(c, r), NTIMES(r, i - 1))
    91     if (i == 0) ZERO else SEQ(der(c, r), NTIMES(r, i - 1))
    89 }
    92 }
    90 
    93 
    91 // Flatten
    94 // Flatten
    92 def flatten(v: Val) : String = v match {
    95 def flatten(v: Val) : String = v match {
   111 }
   114 }
   112 
   115 
   113 // Mkeps
   116 // Mkeps
   114 def mkeps(r: Rexp) : Val = r match {
   117 def mkeps(r: Rexp) : Val = r match {
   115   case ONE => Empty
   118   case ONE => Empty
   116   case ALT(r1, r2) => 
   119   case ALT(r1, r2) =>
   117     if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
   120     if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
   118   case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
   121   case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
   119   case STAR(r) => Stars(Nil)
   122   case STAR(r) => Stars(Nil)
   120   case RECD(x, r) => Rec(x, mkeps(r))
   123   case RECD(x, r) => Rec(x, mkeps(r))
   121 
   124 
   130   case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
   133   case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
   131   case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
   134   case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
   132   case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
   135   case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
   133   case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))
   136   case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))
   134   case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))
   137   case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))
   135   case (CHAR(d), Empty) => Chr(c) 
   138   case (CHAR(d), Empty) => Chr(c)
   136   case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))
   139   case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))
   137 
   140 
   138   case (RANGE(_), Empty) => Chr(c)
   141   case (RANGE(_), Empty) => Chr(c)
   139   case (PLUS(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
   142   case (PLUS(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
   140   case (OPTIONAL(r), v1) => Stars(List(inj(r, c, v1)))
   143   case (OPTIONAL(r), v1) => Stars(List(inj(r, c, v1)))
   150   case Left(v) => Left(f1(v))
   153   case Left(v) => Left(f1(v))
   151 }
   154 }
   152 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match {
   155 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match {
   153   case Sequ(v1, v2) => Sequ(f1(v1), f2(v2))
   156   case Sequ(v1, v2) => Sequ(f1(v1), f2(v2))
   154 }
   157 }
   155 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) = 
   158 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) =
   156   (v:Val) => Sequ(f1(Empty), f2(v))
   159   (v:Val) => Sequ(f1(Empty), f2(v))
   157 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) = 
   160 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) =
   158   (v:Val) => Sequ(f1(v), f2(Empty))
   161   (v:Val) => Sequ(f1(v), f2(Empty))
   159 def F_RECD(f: Val => Val) = (v:Val) => v match {
   162 def F_RECD(f: Val => Val) = (v:Val) => v match {
   160   case Rec(x, v) => Rec(x, f(v))
   163   case Rec(x, v) => Rec(x, f(v))
   161 }
   164 }
   162 def F_ERROR(v: Val): Val = throw new Exception("error")
   165 def F_ERROR(v: Val): Val = throw new Exception("error")
   168     val (r2s, f2s) = simp(r2)
   171     val (r2s, f2s) = simp(r2)
   169     (r1s, r2s) match {
   172     (r1s, r2s) match {
   170       case (ZERO, _) => (r2s, F_RIGHT(f2s))
   173       case (ZERO, _) => (r2s, F_RIGHT(f2s))
   171       case (_, ZERO) => (r1s, F_LEFT(f1s))
   174       case (_, ZERO) => (r1s, F_LEFT(f1s))
   172       case _ => if (r1s == r2s) (r1s, F_LEFT(f1s))
   175       case _ => if (r1s == r2s) (r1s, F_LEFT(f1s))
   173                 else (ALT (r1s, r2s), F_ALT(f1s, f2s)) 
   176                 else (ALT (r1s, r2s), F_ALT(f1s, f2s))
   174     }
   177     }
   175   }
   178   }
   176   case SEQ(r1, r2) => {
   179   case SEQ(r1, r2) => {
   177     val (r1s, f1s) = simp(r1)
   180     val (r1s, f1s) = simp(r1)
   178     val (r2s, f2s) = simp(r2)
   181     val (r2s, f2s) = simp(r2)
   187   case r => (r, F_ID)
   190   case r => (r, F_ID)
   188 }
   191 }
   189 
   192 
   190 // Lex
   193 // Lex
   191 def lex_simp(r: Rexp, s: List[Char]) : Val = s match {
   194 def lex_simp(r: Rexp, s: List[Char]) : Val = s match {
   192   case Nil => if (nullable(r)) mkeps(r) else 
   195   case Nil => if (nullable(r)) mkeps(r) else
   193     { throw new Exception("lexing error") } 
   196     { throw new Exception("lexing error") }
   194   case c::cs => {
   197   case c::cs => {
   195     val (r_simp, f_simp) = simp(der(c, r))
   198     val (r_simp, f_simp) = simp(der(c, r))
   196     inj(r, c, f_simp(lex_simp(r_simp, cs)))
   199     inj(r, c, f_simp(lex_simp(r_simp, cs)))
   197   }
   200   }
   198 }
   201 }
   199 
   202 
   200 def lexing_simp(r: Rexp, s: String) = env(lex_simp(r, s.toList))
   203 def lexing_simp(r: Rexp, s: String) = env(lex_simp(r, s.toList))
   201 
   204 
   202 // Language specific code
   205 // Language specific code
   203 val KEYWORD : Rexp = "while" | "if" | "then" | "else" | "do" | "for" | "to" | "true" | "false" | "read" | "write" | "skip" 
   206 val KEYWORD : Rexp = "while" | "if" | "then" | "else" | "do" | "for" | "to" | "true" | "false" | "read" | "write" | "skip"
   204 val OP : Rexp = "+" | "-" | "*" | "%" | "/" | "==" | "!=" | ">" | "<" | ">=" | "<=" | ":=" | "&&" | "||"
   207 val OP : Rexp = "+" | "-" | "*" | "%" | "/" | "==" | "!=" | ">" | "<" | ">=" | "<=" | ":=" | "&&" | "||"
   205 val LET: Rexp = RANGE(('A' to 'Z').toSet ++ ('a' to 'z').toSet)
   208 val LET: Rexp = RANGE(('A' to 'Z').toSet ++ ('a' to 'z').toSet)
   206 val SYM : Rexp = RANGE(Set('.', '_', '>', '<', '=', ';', ',', ':', ')', '('))
   209 val SYM : Rexp = RANGE(Set('.', '_', '>', '<', '=', ';', ',', ':', ')', '('))
   207 val PARENS : Rexp = "(" | "{" | ")" | "}"
   210 val PARENS : Rexp = "(" | "{" | ")" | "}"
   208 val SEMI : Rexp = ";"
   211 val SEMI : Rexp = ";"
   211 val DIGIT1 : Rexp = RANGE(('1' to '9').toSet)
   214 val DIGIT1 : Rexp = RANGE(('1' to '9').toSet)
   212 val STRING : Rexp = "\"" ~ (LET | SYM | DIGIT | " " | "\\n").% ~ "\""
   215 val STRING : Rexp = "\"" ~ (LET | SYM | DIGIT | " " | "\\n").% ~ "\""
   213 val ID : Rexp = LET ~ (LET | "_" | DIGIT).%
   216 val ID : Rexp = LET ~ (LET | "_" | DIGIT).%
   214 val NUM : Rexp = "0" | (DIGIT1 ~ DIGIT.%)
   217 val NUM : Rexp = "0" | (DIGIT1 ~ DIGIT.%)
   215 val EOL : Rexp = "\n" | "\r\n"
   218 val EOL : Rexp = "\n" | "\r\n"
   216 val COMMENT : Rexp = "//" ~ (LET | SYM | PARENS | " " | DIGIT).% ~ EOL 
   219 val COMMENT : Rexp = "//" ~ (LET | SYM | PARENS | " " | DIGIT).% ~ EOL
   217 
   220 
   218 val WHILE_REGS = (("k" $ KEYWORD) | 
   221 val WHILE_REGS = (("k" $ KEYWORD) |
   219                   ("o" $ OP) | 
   222                   ("o" $ OP) |
   220                   ("str" $ STRING) |
   223                   ("str" $ STRING) |
   221                   ("p" $ PARENS) |
   224                   ("p" $ PARENS) |
   222                   ("s" $ SEMI) | 
   225                   ("s" $ SEMI) |
   223                   ("w" $ WHITESPACE) | 
   226                   ("w" $ WHITESPACE) |
   224                   ("i" $ ID) | 
   227                   ("i" $ ID) |
   225                   ("n" $ NUM) |
   228                   ("n" $ NUM) |
   226 		  ("c" $ COMMENT)).%
   229                   ("c" $ COMMENT)).%
   227 
   230 
   228 
   231 
   229 
   232 
       
   233 
       
   234 
       
   235 def escapedChar(ch: Char): String = ch match {
       
   236   case '\b' => "\\b"
       
   237   case '\t' => "\\t"
       
   238   case '\n' => "\\n"
       
   239   case '\f' => "\\f"
       
   240   case '\r' => "\\r"
       
   241   case '"'  => "\\\""
       
   242   case '\'' => "\\\'"
       
   243   case '\\' => "\\\\"
       
   244   case _    => if (ch.isControl) "\\0" + Integer.toOctalString(ch.toInt)
       
   245                else              String.valueOf(ch)
       
   246 }
       
   247 
       
   248 def esc(s: String): String = "\"" + escapeImpl(s) + "\""
       
   249 def escapeImpl(s: String): String = s.flatMap(escapedChar)
       
   250 
       
   251 /*
   230 def esc(raw: String): String = {
   252 def esc(raw: String): String = {
   231   import scala.reflect.runtime.universe._
   253   import scala.reflect.runtime.universe._
   232   Literal(Constant(raw)).toString
   254   Literal(Constant(raw)).toString
   233 }
   255 }
       
   256 */
   234 
   257 
   235 def escape(tks: List[(String, String)]) =
   258 def escape(tks: List[(String, String)]) =
   236   tks.map{ case (s1, s2) => (esc(s1), esc(s2))}
   259   tks.map{ case (s1, s2) => (s1, s2)}
   237 
   260 
   238 
   261 
   239 // Tokens
   262 // Tokens
   240 abstract class Token extends Serializable 
   263 abstract class Token extends Serializable
   241 case class T_KEYWORD(s: String) extends Token
   264 case class T_KEYWORD(s: String) extends Token
   242 case class T_OP(s: String) extends Token
   265 case class T_OP(s: String) extends Token
   243 case class T_STRING(s: String) extends Token
   266 case class T_STRING(s: String) extends Token
   244 case class T_PAREN(s: String) extends Token
   267 case class T_PAREN(s: String) extends Token
   245 case object T_SEMI extends Token
   268 case object T_SEMI extends Token
   255   case ("i", s) => T_ID(s)
   278   case ("i", s) => T_ID(s)
   256   case ("n", s) => T_NUM(s.toInt)
   279   case ("n", s) => T_NUM(s.toInt)
   257 }
   280 }
   258 
   281 
   259 // Tokenise
   282 // Tokenise
   260 def tokenise(s: String) = //: List[Token] = 
   283 def tokenise(s: String) = //: List[Token] =
   261   escape(lexing_simp(WHILE_REGS, s)).filter{p => p._1 != "\"w\""}//.collect(token)
   284   escape(lexing_simp(WHILE_REGS, s)).collect(token)
   262 
   285 
   263 
   286 
   264 
   287 
   265 
   288 
   266 println(tokenise(os.read(os.pwd / "primes.while")))
   289 println(tokenise(os.read(os.pwd / "primes.while")))
   267