| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Fri, 24 Oct 2025 11:26:43 +0100 | |
| changeset 1018 | fd6a64c53f0e | 
| parent 1010 | adc61c55e165 | 
| child 1019 | f71399fe3fdc | 
| permissions | -rw-r--r-- | 
| 725 | 1  | 
// A simple lexer inspired by work of Sulzmann & Lu  | 
2  | 
//==================================================  | 
|
3  | 
//  | 
|
| 827 | 4  | 
// call with  | 
5  | 
//  | 
|
6  | 
// amm lex.sc  | 
|
7  | 
//  | 
|
8  | 
||
| 725 | 9  | 
|
| 980 | 10  | 
// regular expressions including recods  | 
11  | 
// for extracting strings or tokens  | 
|
12  | 
enum Rexp { 
 | 
|
13  | 
case ZERO  | 
|
14  | 
case ONE  | 
|
15  | 
case CHAR(c: Char)  | 
|
16  | 
case ALT(r1: Rexp, r2: Rexp)  | 
|
17  | 
case SEQ(r1: Rexp, r2: Rexp)  | 
|
18  | 
case STAR(r: Rexp)  | 
|
19  | 
case RECD[A](label: A, r: Rexp)  | 
|
20  | 
}  | 
|
21  | 
import Rexp._  | 
|
22  | 
||
| 725 | 23  | 
// values  | 
| 980 | 24  | 
enum Val {
 | 
25  | 
case Empty  | 
|
26  | 
case Chr(c: Char)  | 
|
27  | 
case Sequ(v1: Val, v2: Val)  | 
|
28  | 
case Left(v: Val)  | 
|
29  | 
case Right(v: Val)  | 
|
30  | 
case Stars(vs: List[Val])  | 
|
31  | 
case Rec[A](label: A, v: Val)  | 
|
32  | 
}  | 
|
33  | 
import Val._  | 
|
34  | 
||
| 725 | 35  | 
// some convenience for typing in regular expressions  | 
| 980 | 36  | 
import scala.language.implicitConversions  | 
| 725 | 37  | 
|
38  | 
def charlist2rexp(s : List[Char]): Rexp = s match {
 | 
|
39  | 
case Nil => ONE  | 
|
40  | 
case c::Nil => CHAR(c)  | 
|
41  | 
case c::s => SEQ(CHAR(c), charlist2rexp(s))  | 
|
42  | 
}  | 
|
| 
958
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
947 
diff
changeset
 | 
43  | 
|
| 
 
6caee1c0222e
updated and added pascal.while file
 
Christian Urban <christian.urban@kcl.ac.uk> 
parents: 
947 
diff
changeset
 | 
44  | 
given Conversion[String, Rexp] = (s => charlist2rexp(s.toList))  | 
| 725 | 45  | 
|
| 787 | 46  | 
val HELLO : Rexp = "hello"  | 
47  | 
||
| 935 | 48  | 
extension (r: Rexp) {
 | 
| 725 | 49  | 
def | (s: Rexp) = ALT(r, s)  | 
50  | 
def % = STAR(r)  | 
|
51  | 
def ~ (s: Rexp) = SEQ(r, s)  | 
|
52  | 
}  | 
|
53  | 
||
| 787 | 54  | 
val TEST = ("ab" | "ba").%
 | 
55  | 
||
| 725 | 56  | 
def nullable(r: Rexp) : Boolean = r match {
 | 
57  | 
case ZERO => false  | 
|
58  | 
case ONE => true  | 
|
59  | 
case CHAR(_) => false  | 
|
60  | 
case ALT(r1, r2) => nullable(r1) || nullable(r2)  | 
|
61  | 
case SEQ(r1, r2) => nullable(r1) && nullable(r2)  | 
|
62  | 
case STAR(_) => true  | 
|
63  | 
case RECD(_, r1) => nullable(r1)  | 
|
64  | 
}  | 
|
65  | 
||
66  | 
def der(c: Char, r: Rexp) : Rexp = r match {
 | 
|
67  | 
case ZERO => ZERO  | 
|
68  | 
case ONE => ZERO  | 
|
69  | 
case CHAR(d) => if (c == d) ONE else ZERO  | 
|
70  | 
case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))  | 
|
71  | 
case SEQ(r1, r2) =>  | 
|
72  | 
if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))  | 
|
73  | 
else SEQ(der(c, r1), r2)  | 
|
74  | 
case STAR(r) => SEQ(der(c, r), STAR(r))  | 
|
75  | 
case RECD(_, r1) => der(c, r1)  | 
|
76  | 
}  | 
|
77  | 
||
78  | 
||
| 742 | 79  | 
// extracts a string from a value  | 
| 725 | 80  | 
def flatten(v: Val) : String = v match {
 | 
81  | 
case Empty => ""  | 
|
82  | 
case Chr(c) => c.toString  | 
|
83  | 
case Left(v) => flatten(v)  | 
|
84  | 
case Right(v) => flatten(v)  | 
|
85  | 
case Sequ(v1, v2) => flatten(v1) ++ flatten(v2)  | 
|
86  | 
case Stars(vs) => vs.map(flatten).mkString  | 
|
87  | 
case Rec(_, v) => flatten(v)  | 
|
88  | 
}  | 
|
89  | 
||
90  | 
||
91  | 
// extracts an environment from a value;  | 
|
92  | 
// used for tokenising a string  | 
|
| 980 | 93  | 
//import scala.reflect.ClassTag  | 
94  | 
||
95  | 
def env[A](v: Val) : List[(A, String)] = v match {
 | 
|
| 725 | 96  | 
case Empty => Nil  | 
97  | 
case Chr(c) => Nil  | 
|
98  | 
case Left(v) => env(v)  | 
|
99  | 
case Right(v) => env(v)  | 
|
100  | 
case Sequ(v1, v2) => env(v1) ::: env(v2)  | 
|
101  | 
case Stars(vs) => vs.flatMap(env)  | 
|
| 980 | 102  | 
case Rec[A](x, v) => (x, flatten(v))::env(v)  | 
| 725 | 103  | 
}  | 
104  | 
||
105  | 
||
106  | 
// The injection and mkeps part of the lexer  | 
|
107  | 
//===========================================  | 
|
108  | 
||
| 980 | 109  | 
// the pattern-matches are defined to be @unchecked  | 
110  | 
// because they do not need to be defined for  | 
|
111  | 
// all cases  | 
|
112  | 
||
113  | 
def mkeps(r: Rexp) : Val = (r: @unchecked) match {
 | 
|
| 725 | 114  | 
case ONE => Empty  | 
115  | 
case ALT(r1, r2) =>  | 
|
116  | 
if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))  | 
|
117  | 
case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))  | 
|
118  | 
case STAR(r) => Stars(Nil)  | 
|
119  | 
case RECD(x, r) => Rec(x, mkeps(r))  | 
|
120  | 
}  | 
|
121  | 
||
| 980 | 122  | 
def inj(r: Rexp, c: Char, v: Val) : Val = ((r, v) : @unchecked) match {
 | 
| 725 | 123  | 
case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)  | 
124  | 
case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)  | 
|
125  | 
case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)  | 
|
126  | 
case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))  | 
|
127  | 
case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))  | 
|
128  | 
case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))  | 
|
129  | 
case (CHAR(d), Empty) => Chr(c)  | 
|
130  | 
case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))  | 
|
131  | 
}  | 
|
132  | 
||
| 844 | 133  | 
// lexing functions without simplification  | 
| 786 | 134  | 
def lex(r: Rexp, s: List[Char]) : Val = s match {
 | 
135  | 
case Nil => if (nullable(r)) mkeps(r) else  | 
|
136  | 
    { throw new Exception("lexing error") } 
 | 
|
137  | 
case c::cs => inj(r, c, lex(der(c, r), cs))  | 
|
| 725 | 138  | 
}  | 
139  | 
||
| 980 | 140  | 
|
| 725 | 141  | 
|
| 852 | 142  | 
println(lex(("ab" | "a") ~ (ONE | "b"), "ab".toList))
 | 
143  | 
||
144  | 
println(lex(STAR("aa" | "a"), "aaa".toList))
 | 
|
| 725 | 145  | 
|
| 980 | 146  | 
println(lex(STAR(STAR("a")), "aaa".toList))
 | 
147  | 
||
148  | 
||
| 725 | 149  | 
// The Lexing Rules for the WHILE Language  | 
150  | 
||
151  | 
def PLUS(r: Rexp) = r ~ r.%  | 
|
152  | 
||
153  | 
def Range(s : List[Char]) : Rexp = s match {
 | 
|
154  | 
case Nil => ZERO  | 
|
155  | 
case c::Nil => CHAR(c)  | 
|
156  | 
case c::s => ALT(CHAR(c), Range(s))  | 
|
157  | 
}  | 
|
158  | 
def RANGE(s: String) = Range(s.toList)  | 
|
159  | 
||
| 800 | 160  | 
val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_")
 | 
| 725 | 161  | 
val DIGIT = RANGE("0123456789")
 | 
162  | 
val ID = SYM ~ (SYM | DIGIT).%  | 
|
163  | 
val NUM = PLUS(DIGIT)  | 
|
164  | 
val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write"  | 
|
165  | 
val SEMI: Rexp = ";"  | 
|
166  | 
val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">"  | 
|
| 846 | 167  | 
val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r")
 | 
| 787 | 168  | 
val RPAREN: Rexp = "}"  | 
169  | 
val LPAREN: Rexp = "{"
 | 
|
| 725 | 170  | 
val STRING: Rexp = "\"" ~ SYM.% ~ "\""  | 
171  | 
||
172  | 
||
| 980 | 173  | 
enum TAGS {
 | 
174  | 
case Key, Id, Op, Num, Semi, Str, Paren, Wht  | 
|
175  | 
}  | 
|
176  | 
import TAGS._  | 
|
177  | 
||
| 1010 | 178  | 
|
| 980 | 179  | 
extension (t: TAGS) {
 | 
| 1010 | 180  | 
def $ (r: Rexp) = RECD[TAGS](t, r)  | 
| 980 | 181  | 
}  | 
| 725 | 182  | 
|
| 980 | 183  | 
def lexing(r: Rexp, s: String) =  | 
184  | 
env[TAGS](lex(r, s.toList))  | 
|
| 786 | 185  | 
|
| 1010 | 186  | 
val WHILE_REGS = ((Key $ KEYWORD) |  | 
187  | 
(Id $ ID) |  | 
|
188  | 
(Op $ OP) |  | 
|
189  | 
(Num $ NUM) |  | 
|
190  | 
(Semi $ SEMI) |  | 
|
191  | 
(Str $ STRING) |  | 
|
192  | 
(Paren $ (LPAREN | RPAREN)) |  | 
|
193  | 
(Wht $ WHITESPACE)).%  | 
|
| 980 | 194  | 
|
| 725 | 195  | 
|
196  | 
// Two Simple While Tests  | 
|
197  | 
//========================  | 
|
| 980 | 198  | 
|
| 725 | 199  | 
@main  | 
200  | 
def small() = {
 | 
|
201  | 
||
| 786 | 202  | 
val prog0 = """if"""  | 
| 725 | 203  | 
println(s"test: $prog0")  | 
| 786 | 204  | 
println(lexing(WHILE_REGS, prog0))  | 
| 725 | 205  | 
|
| 786 | 206  | 
val prog1 = """iffoo"""  | 
| 725 | 207  | 
println(s"test: $prog1")  | 
| 786 | 208  | 
println(lexing(WHILE_REGS, prog1))  | 
| 725 | 209  | 
|
| 786 | 210  | 
val prog2 = """read n; write n"""  | 
211  | 
println(s"test: $prog2")  | 
|
212  | 
println(lexing(WHILE_REGS, prog2))  | 
|
| 725 | 213  | 
}  | 
214  |