author | Christian Urban <christian.urban@kcl.ac.uk> |
Sun, 19 Oct 2025 09:44:04 +0200 | |
changeset 1011 | 31e011ce66e3 |
parent 981 | 14e5ae1fb541 |
permissions | -rw-r--r-- |
725 | 1 |
// A simple lexer inspired by work of Sulzmann & Lu |
2 |
//================================================== |
|
3 |
// |
|
827 | 4 |
// call with |
5 |
// |
|
6 |
// amm lex.sc |
|
7 |
// |
|
8 |
||
725 | 9 |
|
981 | 10 |
// regular expressions including recods |
11 |
// for extracting strings or tokens |
|
12 |
enum Rexp { |
|
13 |
case ZERO |
|
14 |
case ONE |
|
15 |
case CHAR(c: Char) |
|
16 |
case ALT(r1: Rexp, r2: Rexp) |
|
17 |
case SEQ(r1: Rexp, r2: Rexp) |
|
18 |
case STAR(r: Rexp) |
|
19 |
case RECD[A](label: A, r: Rexp) |
|
20 |
} |
|
21 |
import Rexp._ |
|
22 |
||
725 | 23 |
// values |
981 | 24 |
enum Val { |
25 |
case Empty |
|
26 |
case Chr(c: Char) |
|
27 |
case Sequ(v1: Val, v2: Val) |
|
28 |
case Left(v: Val) |
|
29 |
case Right(v: Val) |
|
30 |
case Stars(vs: List[Val]) |
|
31 |
case Rec[A](label: A, v: Val) |
|
32 |
} |
|
33 |
import Val._ |
|
34 |
||
725 | 35 |
// some convenience for typing in regular expressions |
981 | 36 |
import scala.language.implicitConversions |
725 | 37 |
|
38 |
def charlist2rexp(s : List[Char]): Rexp = s match { |
|
39 |
case Nil => ONE |
|
40 |
case c::Nil => CHAR(c) |
|
41 |
case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
|
42 |
} |
|
959
64ec1884d860
updated and added pascal.while file
Christian Urban <christian.urban@kcl.ac.uk>
parents:
948
diff
changeset
|
43 |
|
64ec1884d860
updated and added pascal.while file
Christian Urban <christian.urban@kcl.ac.uk>
parents:
948
diff
changeset
|
44 |
given Conversion[String, Rexp] = (s => charlist2rexp(s.toList)) |
725 | 45 |
|
787 | 46 |
val HELLO : Rexp = "hello" |
47 |
||
936 | 48 |
extension (r: Rexp) { |
725 | 49 |
def | (s: Rexp) = ALT(r, s) |
50 |
def % = STAR(r) |
|
51 |
def ~ (s: Rexp) = SEQ(r, s) |
|
52 |
} |
|
53 |
||
787 | 54 |
val TEST = ("ab" | "ba").% |
55 |
||
725 | 56 |
def nullable(r: Rexp) : Boolean = r match { |
57 |
case ZERO => false |
|
58 |
case ONE => true |
|
59 |
case CHAR(_) => false |
|
60 |
case ALT(r1, r2) => nullable(r1) || nullable(r2) |
|
61 |
case SEQ(r1, r2) => nullable(r1) && nullable(r2) |
|
62 |
case STAR(_) => true |
|
63 |
case RECD(_, r1) => nullable(r1) |
|
64 |
} |
|
65 |
||
66 |
def der(c: Char, r: Rexp) : Rexp = r match { |
|
67 |
case ZERO => ZERO |
|
68 |
case ONE => ZERO |
|
69 |
case CHAR(d) => if (c == d) ONE else ZERO |
|
70 |
case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |
|
71 |
case SEQ(r1, r2) => |
|
72 |
if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |
|
73 |
else SEQ(der(c, r1), r2) |
|
74 |
case STAR(r) => SEQ(der(c, r), STAR(r)) |
|
75 |
case RECD(_, r1) => der(c, r1) |
|
76 |
} |
|
77 |
||
78 |
||
742 | 79 |
// extracts a string from a value |
725 | 80 |
def flatten(v: Val) : String = v match { |
81 |
case Empty => "" |
|
82 |
case Chr(c) => c.toString |
|
83 |
case Left(v) => flatten(v) |
|
84 |
case Right(v) => flatten(v) |
|
85 |
case Sequ(v1, v2) => flatten(v1) ++ flatten(v2) |
|
86 |
case Stars(vs) => vs.map(flatten).mkString |
|
87 |
case Rec(_, v) => flatten(v) |
|
88 |
} |
|
89 |
||
90 |
||
91 |
// extracts an environment from a value; |
|
92 |
// used for tokenising a string |
|
981 | 93 |
//import scala.reflect.ClassTag |
94 |
||
95 |
def env[A](v: Val) : List[(A, String)] = v match { |
|
725 | 96 |
case Empty => Nil |
97 |
case Chr(c) => Nil |
|
98 |
case Left(v) => env(v) |
|
99 |
case Right(v) => env(v) |
|
100 |
case Sequ(v1, v2) => env(v1) ::: env(v2) |
|
101 |
case Stars(vs) => vs.flatMap(env) |
|
981 | 102 |
case Rec[A](x, v) => (x, flatten(v))::env(v) |
725 | 103 |
} |
104 |
||
105 |
||
106 |
// The injection and mkeps part of the lexer |
|
107 |
//=========================================== |
|
108 |
||
981 | 109 |
// the pattern-matches are defined to be @unchecked |
110 |
// because they do not need to be defined for |
|
111 |
// all cases |
|
112 |
||
113 |
def mkeps(r: Rexp) : Val = (r: @unchecked) match { |
|
725 | 114 |
case ONE => Empty |
115 |
case ALT(r1, r2) => |
|
116 |
if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2)) |
|
117 |
case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2)) |
|
118 |
case STAR(r) => Stars(Nil) |
|
119 |
case RECD(x, r) => Rec(x, mkeps(r)) |
|
120 |
} |
|
121 |
||
981 | 122 |
def inj(r: Rexp, c: Char, v: Val) : Val = ((r, v) : @unchecked) match { |
725 | 123 |
case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs) |
124 |
case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2) |
|
125 |
case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2) |
|
126 |
case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2)) |
|
127 |
case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1)) |
|
128 |
case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2)) |
|
129 |
case (CHAR(d), Empty) => Chr(c) |
|
130 |
case (RECD(x, r1), _) => Rec(x, inj(r1, c, v)) |
|
131 |
} |
|
132 |
||
844 | 133 |
// lexing functions without simplification |
786 | 134 |
def lex(r: Rexp, s: List[Char]) : Val = s match { |
135 |
case Nil => if (nullable(r)) mkeps(r) else |
|
136 |
{ throw new Exception("lexing error") } |
|
137 |
case c::cs => inj(r, c, lex(der(c, r), cs)) |
|
725 | 138 |
} |
139 |
||
981 | 140 |
|
725 | 141 |
|
852 | 142 |
println(lex(("ab" | "a") ~ (ONE | "b"), "ab".toList)) |
143 |
||
144 |
println(lex(STAR("aa" | "a"), "aaa".toList)) |
|
725 | 145 |
|
981 | 146 |
println(lex(STAR(STAR("a")), "aaa".toList)) |
147 |
||
148 |
||
725 | 149 |
// The Lexing Rules for the WHILE Language |
150 |
||
151 |
def PLUS(r: Rexp) = r ~ r.% |
|
152 |
||
153 |
def Range(s : List[Char]) : Rexp = s match { |
|
154 |
case Nil => ZERO |
|
155 |
case c::Nil => CHAR(c) |
|
156 |
case c::s => ALT(CHAR(c), Range(s)) |
|
157 |
} |
|
158 |
def RANGE(s: String) = Range(s.toList) |
|
159 |
||
800 | 160 |
val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_") |
725 | 161 |
val DIGIT = RANGE("0123456789") |
162 |
val ID = SYM ~ (SYM | DIGIT).% |
|
163 |
val NUM = PLUS(DIGIT) |
|
164 |
val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write" |
|
165 |
val SEMI: Rexp = ";" |
|
166 |
val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">" |
|
846 | 167 |
val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r") |
787 | 168 |
val RPAREN: Rexp = "}" |
169 |
val LPAREN: Rexp = "{" |
|
725 | 170 |
val STRING: Rexp = "\"" ~ SYM.% ~ "\"" |
171 |
||
172 |
||
981 | 173 |
enum TAGS { |
174 |
case Key, Id, Op, Num, Semi, Str, Paren, Wht |
|
175 |
} |
|
176 |
import TAGS._ |
|
177 |
||
1011 | 178 |
|
981 | 179 |
extension (t: TAGS) { |
1011 | 180 |
def $ (r: Rexp) = RECD[TAGS](t, r) |
981 | 181 |
} |
725 | 182 |
|
981 | 183 |
def lexing(r: Rexp, s: String) = |
184 |
env[TAGS](lex(r, s.toList)) |
|
786 | 185 |
|
1011 | 186 |
val WHILE_REGS = ((Key $ KEYWORD) | |
187 |
(Id $ ID) | |
|
188 |
(Op $ OP) | |
|
189 |
(Num $ NUM) | |
|
190 |
(Semi $ SEMI) | |
|
191 |
(Str $ STRING) | |
|
192 |
(Paren $ (LPAREN | RPAREN)) | |
|
193 |
(Wht $ WHITESPACE)).% |
|
981 | 194 |
|
725 | 195 |
|
196 |
// Two Simple While Tests |
|
197 |
//======================== |
|
981 | 198 |
|
725 | 199 |
@main |
200 |
def small() = { |
|
201 |
||
786 | 202 |
val prog0 = """if""" |
725 | 203 |
println(s"test: $prog0") |
786 | 204 |
println(lexing(WHILE_REGS, prog0)) |
725 | 205 |
|
786 | 206 |
val prog1 = """iffoo""" |
725 | 207 |
println(s"test: $prog1") |
786 | 208 |
println(lexing(WHILE_REGS, prog1)) |
725 | 209 |
|
786 | 210 |
val prog2 = """read n; write n""" |
211 |
println(s"test: $prog2") |
|
212 |
println(lexing(WHILE_REGS, prog2)) |
|
725 | 213 |
} |
214 |