author | Christian Urban <christian.urban@kcl.ac.uk> |
Sun, 14 Sep 2025 12:59:23 +0100 | |
changeset 984 | 32eead4cd30e |
parent 981 | 14e5ae1fb541 |
permissions | -rw-r--r-- |
725 | 1 |
// A simple lexer inspired by work of Sulzmann & Lu |
2 |
//================================================== |
|
3 |
// |
|
827 | 4 |
// call with |
5 |
// |
|
6 |
// amm lex.sc |
|
7 |
// |
|
8 |
||
725 | 9 |
|
981 | 10 |
// regular expressions including recods |
11 |
// for extracting strings or tokens |
|
12 |
enum Rexp { |
|
13 |
case ZERO |
|
14 |
case ONE |
|
15 |
case CHAR(c: Char) |
|
16 |
case ALT(r1: Rexp, r2: Rexp) |
|
17 |
case SEQ(r1: Rexp, r2: Rexp) |
|
18 |
case STAR(r: Rexp) |
|
19 |
case RECD[A](label: A, r: Rexp) |
|
20 |
} |
|
21 |
import Rexp._ |
|
22 |
||
725 | 23 |
// values |
981 | 24 |
enum Val { |
25 |
case Empty |
|
26 |
case Chr(c: Char) |
|
27 |
case Sequ(v1: Val, v2: Val) |
|
28 |
case Left(v: Val) |
|
29 |
case Right(v: Val) |
|
30 |
case Stars(vs: List[Val]) |
|
31 |
case Rec[A](label: A, v: Val) |
|
32 |
} |
|
33 |
import Val._ |
|
34 |
||
725 | 35 |
// some convenience for typing in regular expressions |
981 | 36 |
import scala.language.implicitConversions |
725 | 37 |
|
38 |
def charlist2rexp(s : List[Char]): Rexp = s match { |
|
39 |
case Nil => ONE |
|
40 |
case c::Nil => CHAR(c) |
|
41 |
case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
|
42 |
} |
|
959
64ec1884d860
updated and added pascal.while file
Christian Urban <christian.urban@kcl.ac.uk>
parents:
948
diff
changeset
|
43 |
|
64ec1884d860
updated and added pascal.while file
Christian Urban <christian.urban@kcl.ac.uk>
parents:
948
diff
changeset
|
44 |
given Conversion[String, Rexp] = (s => charlist2rexp(s.toList)) |
725 | 45 |
|
787 | 46 |
val HELLO : Rexp = "hello" |
47 |
||
936 | 48 |
extension (r: Rexp) { |
725 | 49 |
def | (s: Rexp) = ALT(r, s) |
50 |
def % = STAR(r) |
|
51 |
def ~ (s: Rexp) = SEQ(r, s) |
|
52 |
} |
|
53 |
||
948 | 54 |
// to use & for records, instead of $ which had |
55 |
// its precedence be changed in Scala 3 |
|
725 | 56 |
|
787 | 57 |
val TEST = ("ab" | "ba").% |
58 |
||
725 | 59 |
def nullable(r: Rexp) : Boolean = r match { |
60 |
case ZERO => false |
|
61 |
case ONE => true |
|
62 |
case CHAR(_) => false |
|
63 |
case ALT(r1, r2) => nullable(r1) || nullable(r2) |
|
64 |
case SEQ(r1, r2) => nullable(r1) && nullable(r2) |
|
65 |
case STAR(_) => true |
|
66 |
case RECD(_, r1) => nullable(r1) |
|
67 |
} |
|
68 |
||
69 |
def der(c: Char, r: Rexp) : Rexp = r match { |
|
70 |
case ZERO => ZERO |
|
71 |
case ONE => ZERO |
|
72 |
case CHAR(d) => if (c == d) ONE else ZERO |
|
73 |
case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |
|
74 |
case SEQ(r1, r2) => |
|
75 |
if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |
|
76 |
else SEQ(der(c, r1), r2) |
|
77 |
case STAR(r) => SEQ(der(c, r), STAR(r)) |
|
78 |
case RECD(_, r1) => der(c, r1) |
|
79 |
} |
|
80 |
||
81 |
||
742 | 82 |
// extracts a string from a value |
725 | 83 |
def flatten(v: Val) : String = v match { |
84 |
case Empty => "" |
|
85 |
case Chr(c) => c.toString |
|
86 |
case Left(v) => flatten(v) |
|
87 |
case Right(v) => flatten(v) |
|
88 |
case Sequ(v1, v2) => flatten(v1) ++ flatten(v2) |
|
89 |
case Stars(vs) => vs.map(flatten).mkString |
|
90 |
case Rec(_, v) => flatten(v) |
|
91 |
} |
|
92 |
||
93 |
||
94 |
// extracts an environment from a value; |
|
95 |
// used for tokenising a string |
|
981 | 96 |
//import scala.reflect.ClassTag |
97 |
||
98 |
def env[A](v: Val) : List[(A, String)] = v match { |
|
725 | 99 |
case Empty => Nil |
100 |
case Chr(c) => Nil |
|
101 |
case Left(v) => env(v) |
|
102 |
case Right(v) => env(v) |
|
103 |
case Sequ(v1, v2) => env(v1) ::: env(v2) |
|
104 |
case Stars(vs) => vs.flatMap(env) |
|
981 | 105 |
case Rec[A](x, v) => (x, flatten(v))::env(v) |
725 | 106 |
} |
107 |
||
108 |
||
109 |
// The injection and mkeps part of the lexer |
|
110 |
//=========================================== |
|
111 |
||
981 | 112 |
// the pattern-matches are defined to be @unchecked |
113 |
// because they do not need to be defined for |
|
114 |
// all cases |
|
115 |
||
116 |
def mkeps(r: Rexp) : Val = (r: @unchecked) match { |
|
725 | 117 |
case ONE => Empty |
118 |
case ALT(r1, r2) => |
|
119 |
if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2)) |
|
120 |
case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2)) |
|
121 |
case STAR(r) => Stars(Nil) |
|
122 |
case RECD(x, r) => Rec(x, mkeps(r)) |
|
123 |
} |
|
124 |
||
981 | 125 |
def inj(r: Rexp, c: Char, v: Val) : Val = ((r, v) : @unchecked) match { |
725 | 126 |
case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs) |
127 |
case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2) |
|
128 |
case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2) |
|
129 |
case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2)) |
|
130 |
case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1)) |
|
131 |
case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2)) |
|
132 |
case (CHAR(d), Empty) => Chr(c) |
|
133 |
case (RECD(x, r1), _) => Rec(x, inj(r1, c, v)) |
|
134 |
} |
|
135 |
||
844 | 136 |
// lexing functions without simplification |
786 | 137 |
def lex(r: Rexp, s: List[Char]) : Val = s match { |
138 |
case Nil => if (nullable(r)) mkeps(r) else |
|
139 |
{ throw new Exception("lexing error") } |
|
140 |
case c::cs => inj(r, c, lex(der(c, r), cs)) |
|
725 | 141 |
} |
142 |
||
981 | 143 |
|
725 | 144 |
|
852 | 145 |
println(lex(("ab" | "a") ~ (ONE | "b"), "ab".toList)) |
146 |
||
147 |
println(lex(STAR("aa" | "a"), "aaa".toList)) |
|
725 | 148 |
|
981 | 149 |
println(lex(STAR(STAR("a")), "aaa".toList)) |
150 |
||
151 |
val re = ("a" | "ab") ~ ("c" | "bc") |
|
152 |
||
153 |
println(pders1("abc", re).toList.mkString("\n")) |
|
154 |
pders('a', pder('a', re)))) |
|
155 |
draw(simp(der('a', der('a', der('a', re))))) |
|
156 |
||
157 |
size(simp(ders(, re))) |
|
158 |
size(simp(der('a', der('a', re)))) |
|
159 |
size(simp(der('a', der('a', der('a', re))))) |
|
160 |
||
161 |
||
162 |
lex(re, "aaaaa".toList) |
|
163 |
||
725 | 164 |
// The Lexing Rules for the WHILE Language |
165 |
||
166 |
def PLUS(r: Rexp) = r ~ r.% |
|
167 |
||
168 |
def Range(s : List[Char]) : Rexp = s match { |
|
169 |
case Nil => ZERO |
|
170 |
case c::Nil => CHAR(c) |
|
171 |
case c::s => ALT(CHAR(c), Range(s)) |
|
172 |
} |
|
173 |
def RANGE(s: String) = Range(s.toList) |
|
174 |
||
800 | 175 |
val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_") |
725 | 176 |
val DIGIT = RANGE("0123456789") |
177 |
val ID = SYM ~ (SYM | DIGIT).% |
|
178 |
val NUM = PLUS(DIGIT) |
|
179 |
val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write" |
|
180 |
val SEMI: Rexp = ";" |
|
181 |
val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">" |
|
846 | 182 |
val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r") |
787 | 183 |
val RPAREN: Rexp = "}" |
184 |
val LPAREN: Rexp = "{" |
|
725 | 185 |
val STRING: Rexp = "\"" ~ SYM.% ~ "\"" |
186 |
||
187 |
||
981 | 188 |
enum TAGS { |
189 |
case Key, Id, Op, Num, Semi, Str, Paren, Wht |
|
190 |
} |
|
191 |
import TAGS._ |
|
192 |
||
193 |
extension (t: TAGS) { |
|
194 |
def & (r: Rexp) = RECD[TAGS](t, r) |
|
195 |
} |
|
725 | 196 |
|
981 | 197 |
def lexing(r: Rexp, s: String) = |
198 |
env[TAGS](lex(r, s.toList)) |
|
786 | 199 |
|
981 | 200 |
val WHILE_REGS = ((Key & KEYWORD) | |
201 |
(Id & ID) | |
|
202 |
(Op & OP) | |
|
203 |
(Num & NUM) | |
|
204 |
(Semi & SEMI) | |
|
205 |
(Str & STRING) | |
|
206 |
(Paren & (LPAREN | RPAREN)) | |
|
207 |
(Wht & WHITESPACE)).% |
|
208 |
||
725 | 209 |
|
210 |
// Two Simple While Tests |
|
211 |
//======================== |
|
981 | 212 |
|
725 | 213 |
@main |
214 |
def small() = { |
|
215 |
||
786 | 216 |
val prog0 = """if""" |
725 | 217 |
println(s"test: $prog0") |
786 | 218 |
println(lexing(WHILE_REGS, prog0)) |
725 | 219 |
|
786 | 220 |
val prog1 = """iffoo""" |
725 | 221 |
println(s"test: $prog1") |
786 | 222 |
println(lexing(WHILE_REGS, prog1)) |
725 | 223 |
|
786 | 224 |
val prog2 = """read n; write n""" |
225 |
println(s"test: $prog2") |
|
226 |
println(lexing(WHILE_REGS, prog2)) |
|
725 | 227 |
} |
228 |