| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Wed, 17 Sep 2025 15:11:48 +0100 | |
| changeset 985 | 4a530cad7804 | 
| parent 980 | 4f422766763f | 
| child 1010 | adc61c55e165 | 
| permissions | -rw-r--r-- | 
| 725 | 1  | 
// A simple lexer inspired by work of Sulzmann & Lu  | 
2  | 
//==================================================  | 
|
3  | 
//  | 
|
| 827 | 4  | 
// call with  | 
5  | 
//  | 
|
6  | 
// amm lex.sc  | 
|
7  | 
//  | 
|
8  | 
||
| 725 | 9  | 
|
| 980 | 10  | 
// regular expressions including recods  | 
11  | 
// for extracting strings or tokens  | 
|
12  | 
enum Rexp { 
 | 
|
13  | 
case ZERO  | 
|
14  | 
case ONE  | 
|
15  | 
case CHAR(c: Char)  | 
|
16  | 
case ALT(r1: Rexp, r2: Rexp)  | 
|
17  | 
case SEQ(r1: Rexp, r2: Rexp)  | 
|
18  | 
case STAR(r: Rexp)  | 
|
19  | 
case RECD[A](label: A, r: Rexp)  | 
|
20  | 
}  | 
|
21  | 
import Rexp._  | 
|
22  | 
||
| 725 | 23  | 
// values  | 
| 980 | 24  | 
enum Val {
 | 
25  | 
case Empty  | 
|
26  | 
case Chr(c: Char)  | 
|
27  | 
case Sequ(v1: Val, v2: Val)  | 
|
28  | 
case Left(v: Val)  | 
|
29  | 
case Right(v: Val)  | 
|
30  | 
case Stars(vs: List[Val])  | 
|
31  | 
case Rec[A](label: A, v: Val)  | 
|
32  | 
}  | 
|
33  | 
import Val._  | 
|
34  | 
||
| 725 | 35  | 
// some convenience for typing in regular expressions  | 
| 980 | 36  | 
import scala.language.implicitConversions  | 
| 725 | 37  | 
|
38  | 
def charlist2rexp(s : List[Char]): Rexp = s match {
 | 
|
39  | 
case Nil => ONE  | 
|
40  | 
case c::Nil => CHAR(c)  | 
|
41  | 
case c::s => SEQ(CHAR(c), charlist2rexp(s))  | 
|
42  | 
}  | 
|
| 
958
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
947 
diff
changeset
 | 
43  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
947 
diff
changeset
 | 
44  | 
given Conversion[String, Rexp] = (s => charlist2rexp(s.toList))  | 
| 725 | 45  | 
|
| 787 | 46  | 
val HELLO : Rexp = "hello"  | 
47  | 
||
| 935 | 48  | 
extension (r: Rexp) {
 | 
| 725 | 49  | 
def | (s: Rexp) = ALT(r, s)  | 
50  | 
def % = STAR(r)  | 
|
51  | 
def ~ (s: Rexp) = SEQ(r, s)  | 
|
52  | 
}  | 
|
53  | 
||
| 947 | 54  | 
// to use & for records, instead of $ which had  | 
55  | 
// its precedence be changed in Scala 3  | 
|
| 725 | 56  | 
|
| 787 | 57  | 
val TEST = ("ab" | "ba").%
 | 
58  | 
||
| 725 | 59  | 
def nullable(r: Rexp) : Boolean = r match {
 | 
60  | 
case ZERO => false  | 
|
61  | 
case ONE => true  | 
|
62  | 
case CHAR(_) => false  | 
|
63  | 
case ALT(r1, r2) => nullable(r1) || nullable(r2)  | 
|
64  | 
case SEQ(r1, r2) => nullable(r1) && nullable(r2)  | 
|
65  | 
case STAR(_) => true  | 
|
66  | 
case RECD(_, r1) => nullable(r1)  | 
|
67  | 
}  | 
|
68  | 
||
69  | 
def der(c: Char, r: Rexp) : Rexp = r match {
 | 
|
70  | 
case ZERO => ZERO  | 
|
71  | 
case ONE => ZERO  | 
|
72  | 
case CHAR(d) => if (c == d) ONE else ZERO  | 
|
73  | 
case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))  | 
|
74  | 
case SEQ(r1, r2) =>  | 
|
75  | 
if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))  | 
|
76  | 
else SEQ(der(c, r1), r2)  | 
|
77  | 
case STAR(r) => SEQ(der(c, r), STAR(r))  | 
|
78  | 
case RECD(_, r1) => der(c, r1)  | 
|
79  | 
}  | 
|
80  | 
||
81  | 
||
| 742 | 82  | 
// extracts a string from a value  | 
| 725 | 83  | 
def flatten(v: Val) : String = v match {
 | 
84  | 
case Empty => ""  | 
|
85  | 
case Chr(c) => c.toString  | 
|
86  | 
case Left(v) => flatten(v)  | 
|
87  | 
case Right(v) => flatten(v)  | 
|
88  | 
case Sequ(v1, v2) => flatten(v1) ++ flatten(v2)  | 
|
89  | 
case Stars(vs) => vs.map(flatten).mkString  | 
|
90  | 
case Rec(_, v) => flatten(v)  | 
|
91  | 
}  | 
|
92  | 
||
93  | 
||
94  | 
// extracts an environment from a value;  | 
|
95  | 
// used for tokenising a string  | 
|
| 980 | 96  | 
//import scala.reflect.ClassTag  | 
97  | 
||
98  | 
def env[A](v: Val) : List[(A, String)] = v match {
 | 
|
| 725 | 99  | 
case Empty => Nil  | 
100  | 
case Chr(c) => Nil  | 
|
101  | 
case Left(v) => env(v)  | 
|
102  | 
case Right(v) => env(v)  | 
|
103  | 
case Sequ(v1, v2) => env(v1) ::: env(v2)  | 
|
104  | 
case Stars(vs) => vs.flatMap(env)  | 
|
| 980 | 105  | 
case Rec[A](x, v) => (x, flatten(v))::env(v)  | 
| 725 | 106  | 
}  | 
107  | 
||
108  | 
||
109  | 
// The injection and mkeps part of the lexer  | 
|
110  | 
//===========================================  | 
|
111  | 
||
| 980 | 112  | 
// the pattern-matches are defined to be @unchecked  | 
113  | 
// because they do not need to be defined for  | 
|
114  | 
// all cases  | 
|
115  | 
||
116  | 
def mkeps(r: Rexp) : Val = (r: @unchecked) match {
 | 
|
| 725 | 117  | 
case ONE => Empty  | 
118  | 
case ALT(r1, r2) =>  | 
|
119  | 
if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))  | 
|
120  | 
case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))  | 
|
121  | 
case STAR(r) => Stars(Nil)  | 
|
122  | 
case RECD(x, r) => Rec(x, mkeps(r))  | 
|
123  | 
}  | 
|
124  | 
||
| 980 | 125  | 
def inj(r: Rexp, c: Char, v: Val) : Val = ((r, v) : @unchecked) match {
 | 
| 725 | 126  | 
case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)  | 
127  | 
case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)  | 
|
128  | 
case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)  | 
|
129  | 
case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))  | 
|
130  | 
case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))  | 
|
131  | 
case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))  | 
|
132  | 
case (CHAR(d), Empty) => Chr(c)  | 
|
133  | 
case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))  | 
|
134  | 
}  | 
|
135  | 
||
| 844 | 136  | 
// lexing functions without simplification  | 
| 786 | 137  | 
def lex(r: Rexp, s: List[Char]) : Val = s match {
 | 
138  | 
case Nil => if (nullable(r)) mkeps(r) else  | 
|
139  | 
    { throw new Exception("lexing error") } 
 | 
|
140  | 
case c::cs => inj(r, c, lex(der(c, r), cs))  | 
|
| 725 | 141  | 
}  | 
142  | 
||
| 980 | 143  | 
|
| 725 | 144  | 
|
| 852 | 145  | 
println(lex(("ab" | "a") ~ (ONE | "b"), "ab".toList))
 | 
146  | 
||
147  | 
println(lex(STAR("aa" | "a"), "aaa".toList))
 | 
|
| 725 | 148  | 
|
| 980 | 149  | 
println(lex(STAR(STAR("a")), "aaa".toList))
 | 
150  | 
||
151  | 
val re = ("a" | "ab") ~ ("c" | "bc")
 | 
|
152  | 
||
153  | 
println(pders1("abc", re).toList.mkString("\n"))
 | 
|
154  | 
pders('a', pder('a', re))))
 | 
|
155  | 
draw(simp(der('a', der('a', der('a', re)))))
 | 
|
156  | 
||
157  | 
size(simp(ders(, re)))  | 
|
158  | 
size(simp(der('a', der('a', re))))
 | 
|
159  | 
size(simp(der('a', der('a', der('a', re)))))
 | 
|
160  | 
||
161  | 
||
162  | 
lex(re, "aaaaa".toList)  | 
|
163  | 
||
| 725 | 164  | 
// The Lexing Rules for the WHILE Language  | 
165  | 
||
166  | 
def PLUS(r: Rexp) = r ~ r.%  | 
|
167  | 
||
168  | 
def Range(s : List[Char]) : Rexp = s match {
 | 
|
169  | 
case Nil => ZERO  | 
|
170  | 
case c::Nil => CHAR(c)  | 
|
171  | 
case c::s => ALT(CHAR(c), Range(s))  | 
|
172  | 
}  | 
|
173  | 
def RANGE(s: String) = Range(s.toList)  | 
|
174  | 
||
| 800 | 175  | 
val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_")
 | 
| 725 | 176  | 
val DIGIT = RANGE("0123456789")
 | 
177  | 
val ID = SYM ~ (SYM | DIGIT).%  | 
|
178  | 
val NUM = PLUS(DIGIT)  | 
|
179  | 
val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write"  | 
|
180  | 
val SEMI: Rexp = ";"  | 
|
181  | 
val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">"  | 
|
| 846 | 182  | 
val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r")
 | 
| 787 | 183  | 
val RPAREN: Rexp = "}"  | 
184  | 
val LPAREN: Rexp = "{"
 | 
|
| 725 | 185  | 
val STRING: Rexp = "\"" ~ SYM.% ~ "\""  | 
186  | 
||
187  | 
||
| 980 | 188  | 
enum TAGS {
 | 
189  | 
case Key, Id, Op, Num, Semi, Str, Paren, Wht  | 
|
190  | 
}  | 
|
191  | 
import TAGS._  | 
|
192  | 
||
193  | 
extension (t: TAGS) {
 | 
|
194  | 
def & (r: Rexp) = RECD[TAGS](t, r)  | 
|
195  | 
}  | 
|
| 725 | 196  | 
|
| 980 | 197  | 
def lexing(r: Rexp, s: String) =  | 
198  | 
env[TAGS](lex(r, s.toList))  | 
|
| 786 | 199  | 
|
| 980 | 200  | 
val WHILE_REGS = ((Key & KEYWORD) |  | 
201  | 
(Id & ID) |  | 
|
202  | 
(Op & OP) |  | 
|
203  | 
(Num & NUM) |  | 
|
204  | 
(Semi & SEMI) |  | 
|
205  | 
(Str & STRING) |  | 
|
206  | 
(Paren & (LPAREN | RPAREN)) |  | 
|
207  | 
(Wht & WHITESPACE)).%  | 
|
208  | 
||
| 725 | 209  | 
|
210  | 
// Two Simple While Tests  | 
|
211  | 
//========================  | 
|
| 980 | 212  | 
|
| 725 | 213  | 
@main  | 
214  | 
def small() = {
 | 
|
215  | 
||
| 786 | 216  | 
val prog0 = """if"""  | 
| 725 | 217  | 
println(s"test: $prog0")  | 
| 786 | 218  | 
println(lexing(WHILE_REGS, prog0))  | 
| 725 | 219  | 
|
| 786 | 220  | 
val prog1 = """iffoo"""  | 
| 725 | 221  | 
println(s"test: $prog1")  | 
| 786 | 222  | 
println(lexing(WHILE_REGS, prog1))  | 
| 725 | 223  | 
|
| 786 | 224  | 
val prog2 = """read n; write n"""  | 
225  | 
println(s"test: $prog2")  | 
|
226  | 
println(lexing(WHILE_REGS, prog2))  | 
|
| 725 | 227  | 
}  | 
228  |