|      1 // A tokeniser for the Fun language |         | 
|      2 //================================== |         | 
|      3 // |         | 
|      4 // call with  |         | 
|      5 // |         | 
|      6 //     amm fun_tokens.sc fact.fun |         | 
|      7 // |         | 
|      8 //     amm fun_tokens.sc defs.fun |         | 
|      9 // |         | 
|     10  |         | 
|     11  |         | 
|     12  |         | 
|     13 import scala.language.implicitConversions     |         | 
|     14 import scala.language.reflectiveCalls  |         | 
|     15  |         | 
|     16 abstract class Rexp  |         | 
|     17 case object ZERO extends Rexp |         | 
|     18 case object ONE extends Rexp |         | 
|     19 case class CHAR(c: Char) extends Rexp |         | 
|     20 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  |         | 
|     21 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  |         | 
|     22 case class STAR(r: Rexp) extends Rexp  |         | 
|     23 case class RECD(x: String, r: Rexp) extends Rexp |         | 
|     24    |         | 
|     25 abstract class Val |         | 
|     26 case object Empty extends Val |         | 
|     27 case class Chr(c: Char) extends Val |         | 
|     28 case class Sequ(v1: Val, v2: Val) extends Val |         | 
|     29 case class Left(v: Val) extends Val |         | 
|     30 case class Right(v: Val) extends Val |         | 
|     31 case class Stars(vs: List[Val]) extends Val |         | 
|     32 case class Rec(x: String, v: Val) extends Val |         | 
|     33     |         | 
|     34 // some convenience for typing in regular expressions |         | 
|     35 def charlist2rexp(s : List[Char]): Rexp = s match { |         | 
|     36   case Nil => ONE |         | 
|     37   case c::Nil => CHAR(c) |         | 
|     38   case c::s => SEQ(CHAR(c), charlist2rexp(s)) |         | 
|     39 } |         | 
|     40 implicit def string2rexp(s : String) : Rexp =  |         | 
|     41   charlist2rexp(s.toList) |         | 
|     42  |         | 
|     43 implicit def RexpOps(r: Rexp) = new { |         | 
|     44   def | (s: Rexp) = ALT(r, s) |         | 
|     45   def % = STAR(r) |         | 
|     46   def ~ (s: Rexp) = SEQ(r, s) |         | 
|     47 } |         | 
|     48  |         | 
|     49 implicit def stringOps(s: String) = new { |         | 
|     50   def | (r: Rexp) = ALT(s, r) |         | 
|     51   def | (r: String) = ALT(s, r) |         | 
|     52   def % = STAR(s) |         | 
|     53   def ~ (r: Rexp) = SEQ(s, r) |         | 
|     54   def ~ (r: String) = SEQ(s, r) |         | 
|     55   def $ (r: Rexp) = RECD(s, r) |         | 
|     56 } |         | 
|     57  |         | 
|     58 def nullable (r: Rexp) : Boolean = r match { |         | 
|     59   case ZERO => false |         | 
|     60   case ONE => true |         | 
|     61   case CHAR(_) => false |         | 
|     62   case ALT(r1, r2) => nullable(r1) || nullable(r2) |         | 
|     63   case SEQ(r1, r2) => nullable(r1) && nullable(r2) |         | 
|     64   case STAR(_) => true |         | 
|     65   case RECD(_, r1) => nullable(r1) |         | 
|     66 } |         | 
|     67  |         | 
|     68 def der (c: Char, r: Rexp) : Rexp = r match { |         | 
|     69   case ZERO => ZERO |         | 
|     70   case ONE => ZERO |         | 
|     71   case CHAR(d) => if (c == d) ONE else ZERO |         | 
|     72   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |         | 
|     73   case SEQ(r1, r2) =>  |         | 
|     74     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |         | 
|     75     else SEQ(der(c, r1), r2) |         | 
|     76   case STAR(r) => SEQ(der(c, r), STAR(r)) |         | 
|     77   case RECD(_, r1) => der(c, r1) |         | 
|     78 } |         | 
|     79  |         | 
|     80  |         | 
|     81 // extracts a string from value |         | 
|     82 def flatten(v: Val) : String = v match { |         | 
|     83   case Empty => "" |         | 
|     84   case Chr(c) => c.toString |         | 
|     85   case Left(v) => flatten(v) |         | 
|     86   case Right(v) => flatten(v) |         | 
|     87   case Sequ(v1, v2) => flatten(v1) + flatten(v2) |         | 
|     88   case Stars(vs) => vs.map(flatten).mkString |         | 
|     89   case Rec(_, v) => flatten(v) |         | 
|     90 } |         | 
|     91  |         | 
|     92 // extracts an environment from a value; |         | 
|     93 // used for tokenise a string |         | 
|     94 def env(v: Val) : List[(String, String)] = v match { |         | 
|     95   case Empty => Nil |         | 
|     96   case Chr(c) => Nil |         | 
|     97   case Left(v) => env(v) |         | 
|     98   case Right(v) => env(v) |         | 
|     99   case Sequ(v1, v2) => env(v1) ::: env(v2) |         | 
|    100   case Stars(vs) => vs.flatMap(env) |         | 
|    101   case Rec(x, v) => (x, flatten(v))::env(v) |         | 
|    102 } |         | 
|    103  |         | 
|    104 // The Injection Part of the lexer |         | 
|    105  |         | 
|    106 def mkeps(r: Rexp) : Val = r match { |         | 
|    107   case ONE => Empty |         | 
|    108   case ALT(r1, r2) =>  |         | 
|    109     if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2)) |         | 
|    110   case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2)) |         | 
|    111   case STAR(r) => Stars(Nil) |         | 
|    112   case RECD(x, r) => Rec(x, mkeps(r)) |         | 
|    113 } |         | 
|    114  |         | 
|    115 def inj(r: Rexp, c: Char, v: Val) : Val = (r, v) match { |         | 
|    116   case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs) |         | 
|    117   case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2) |         | 
|    118   case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2) |         | 
|    119   case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2)) |         | 
|    120   case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1)) |         | 
|    121   case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2)) |         | 
|    122   case (CHAR(d), Empty) => Chr(c)  |         | 
|    123   case (RECD(x, r1), _) => Rec(x, inj(r1, c, v)) |         | 
|    124   case _ => { println ("Injection error") ; sys.exit(-1) }  |         | 
|    125 } |         | 
|    126  |         | 
|    127 // some "rectification" functions for simplification |         | 
|    128 def F_ID(v: Val): Val = v |         | 
|    129 def F_RIGHT(f: Val => Val) = (v:Val) => Right(f(v)) |         | 
|    130 def F_LEFT(f: Val => Val) = (v:Val) => Left(f(v)) |         | 
|    131 def F_ALT(f1: Val => Val, f2: Val => Val) = (v:Val) => v match { |         | 
|    132   case Right(v) => Right(f2(v)) |         | 
|    133   case Left(v) => Left(f1(v)) |         | 
|    134 } |         | 
|    135 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match { |         | 
|    136   case Sequ(v1, v2) => Sequ(f1(v1), f2(v2)) |         | 
|    137 } |         | 
|    138 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) =  |         | 
|    139   (v:Val) => Sequ(f1(Empty), f2(v)) |         | 
|    140 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) =  |         | 
|    141   (v:Val) => Sequ(f1(v), f2(Empty)) |         | 
|    142 def F_RECD(f: Val => Val) = (v:Val) => v match { |         | 
|    143   case Rec(x, v) => Rec(x, f(v)) |         | 
|    144 } |         | 
|    145 def F_ERROR(v: Val): Val = throw new Exception("error") |         | 
|    146  |         | 
|    147 def simp(r: Rexp): (Rexp, Val => Val) = r match { |         | 
|    148   case ALT(r1, r2) => { |         | 
|    149     val (r1s, f1s) = simp(r1) |         | 
|    150     val (r2s, f2s) = simp(r2) |         | 
|    151     (r1s, r2s) match { |         | 
|    152       case (ZERO, _) => (r2s, F_RIGHT(f2s)) |         | 
|    153       case (_, ZERO) => (r1s, F_LEFT(f1s)) |         | 
|    154       case _ => if (r1s == r2s) (r1s, F_LEFT(f1s)) |         | 
|    155                 else (ALT (r1s, r2s), F_ALT(f1s, f2s))  |         | 
|    156     } |         | 
|    157   } |         | 
|    158   case SEQ(r1, r2) => { |         | 
|    159     val (r1s, f1s) = simp(r1) |         | 
|    160     val (r2s, f2s) = simp(r2) |         | 
|    161     (r1s, r2s) match { |         | 
|    162       case (ZERO, _) => (ZERO, F_ERROR) |         | 
|    163       case (_, ZERO) => (ZERO, F_ERROR) |         | 
|    164       case (ONE, _) => (r2s, F_SEQ_Empty1(f1s, f2s)) |         | 
|    165       case (_, ONE) => (r1s, F_SEQ_Empty2(f1s, f2s)) |         | 
|    166       case _ => (SEQ(r1s,r2s), F_SEQ(f1s, f2s)) |         | 
|    167     } |         | 
|    168   } |         | 
|    169   case RECD(x, r1) => { |         | 
|    170     val (r1s, f1s) = simp(r1) |         | 
|    171     (RECD(x, r1s), F_RECD(f1s)) |         | 
|    172   } |         | 
|    173   case r => (r, F_ID) |         | 
|    174 } |         | 
|    175  |         | 
|    176 // lexing functions including simplification |         | 
|    177 def lex_simp(r: Rexp, s: List[Char]) : Val = s match { |         | 
|    178   case Nil => if (nullable(r)) mkeps(r) else { println ("Lexing Error") ; sys.exit(-1) }  |         | 
|    179   case c::cs => { |         | 
|    180     val (r_simp, f_simp) = simp(der(c, r)) |         | 
|    181     inj(r, c, f_simp(lex_simp(r_simp, cs))) |         | 
|    182   } |         | 
|    183 } |         | 
|    184  |         | 
|    185 def lexing_simp(r: Rexp, s: String) = env(lex_simp(r, s.toList)) |         | 
|    186  |         | 
|    187  |         | 
|    188 // The Lexing Rules for the Fun Language |         | 
|    189  |         | 
|    190 def PLUS(r: Rexp) = r ~ r.% |         | 
|    191 def OPT(r: Rexp) = r | ONE |         | 
|    192  |         | 
|    193 val SYM = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" |  |         | 
|    194           "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" |  |         | 
|    195           "w" | "x" | "y" | "z" | "A" | "B" | "C" | "D" |"E" | "F" | "G" | |         | 
|    196           "H" | "I" | "J" | "K" |"L" | "M" | "N" | |         | 
|    197           "O" | "P" | "Q" | "R" |"S" | "T" | "U" | |         | 
|    198           "V" | "W" | "X" | "Y" | "Z" | "_" | ":" |         | 
|    199 val DIGIT = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" |         | 
|    200 val ID = SYM ~ (SYM | DIGIT).%  |         | 
|    201 val NUM = PLUS(DIGIT) |         | 
|    202 val FNUM = OPT("-") ~ NUM ~ "." ~ NUM  |         | 
|    203 val KEYWORD : Rexp = "if" | "then" | "else" | "def" | "val" |         | 
|    204 val TYPE : Rexp = "Void" | "Int" | "Double"  |         | 
|    205 val SEMI: Rexp = ";" |         | 
|    206 val COLON: Rexp = ":" |         | 
|    207 val COMMA: Rexp = "," |         | 
|    208 val OP: Rexp = "=" | "==" | "-" | "+" | "*" | "!=" | "<" | ">" | "<=" | ">=" | "%" | "/" |         | 
|    209 val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r") |         | 
|    210 val RPAREN: Rexp = ")" | "}" |         | 
|    211 val LPAREN: Rexp = "(" | "{" |         | 
|    212 val ALL = SYM | DIGIT | OP | " " | ":" | ";" | "-" | "." | "\"" | "=" | "," | "(" | ")" | "{" | "}" |         | 
|    213 val ALL2 = ALL | "\n" |         | 
|    214 val COMMENT = ("/*" ~ ALL2.% ~ "*/") | ("//" ~ ALL.% ~ "\n") |         | 
|    215  |         | 
|    216 val CHR :Rexp = "'" ~ (ALL | "\\n") ~ "'"  |         | 
|    217  |         | 
|    218  |         | 
|    219 val FUN_REGS = (("k" $ KEYWORD) |  |         | 
|    220                 ("t" $ TYPE) | |         | 
|    221                 ("i" $ ID) |  |         | 
|    222                 ("ch" $ CHR) |  |         | 
|    223                 ("o" $ OP) |  |         | 
|    224                 ("n" $ NUM) |  |         | 
|    225                 ("f" $ FNUM) |  |         | 
|    226                 ("s" $ SEMI) |  |         | 
|    227                 ("co" $ COLON) | |         | 
|    228                 ("c" $ COMMA) | |         | 
|    229                 ("pl" $ LPAREN) | |         | 
|    230                 ("pr" $ RPAREN) | |         | 
|    231                 ("w" $ (WHITESPACE | COMMENT))).% |         | 
|    232  |         | 
|    233  |         | 
|    234  |         | 
|    235 // The tokens for the Fun language |         | 
|    236  |         | 
|    237 abstract class Token extends Serializable  |         | 
|    238 case object T_SEMI extends Token |         | 
|    239 case object T_COMMA extends Token |         | 
|    240 case object T_COLON extends Token |         | 
|    241 case object T_LPAREN extends Token |         | 
|    242 case object T_RPAREN extends Token |         | 
|    243 case class T_ID(s: String) extends Token |         | 
|    244 case class T_FID(s: String) extends Token |         | 
|    245 case class T_OP(s: String) extends Token |         | 
|    246 case class T_NUM(n: Int) extends Token |         | 
|    247 case class T_FNUM(x: Double) extends Token |         | 
|    248 case class T_KWD(s: String) extends Token |         | 
|    249 case class T_TY(s: String) extends Token |         | 
|    250 case class T_CHR(i: Int) extends Token |         | 
|    251  |         | 
|    252 val token : PartialFunction[(String, String), Token] = { |         | 
|    253   case ("k", s) => T_KWD(s) |         | 
|    254   case ("t", s) => T_TY(s) |         | 
|    255   case ("i", s) => T_ID(s) |         | 
|    256   case ("o", s) => T_OP(s) |         | 
|    257   case ("n", s) => T_NUM(s.toInt) |         | 
|    258   case ("ch", s) => if (s == "'\\n'") T_CHR(10) else T_CHR(s(1).toInt) |         | 
|    259   case ("f", s) => T_FNUM(s.toDouble)  |         | 
|    260   case ("s", _) => T_SEMI |         | 
|    261   case ("c", _) => T_COMMA |         | 
|    262   case ("co", _) => T_COLON |         | 
|    263   case ("pl", _) => T_LPAREN |         | 
|    264   case ("pr", _) => T_RPAREN |         | 
|    265 } |         | 
|    266  |         | 
|    267  |         | 
|    268 def tokenise(s: String) : List[Token] = { |         | 
|    269   val tks = lexing_simp(FUN_REGS, s).collect(token) |         | 
|    270   if (tks.length != 0) tks |         | 
|    271   else { println (s"Tokenise Error") ; sys.exit(-1) }      |         | 
|    272 } |         | 
|    273  |         | 
|    274 //import ammonite.ops._ |         | 
|    275  |         | 
|    276 //@doc("Tokenising a file.") |         | 
|    277 @main |         | 
|    278 def main(fname: String) = { |         | 
|    279   println(tokenise(os.read(os.pwd / fname))) |         | 
|    280 } |         |