progs/lexer/lex.sc
author Christian Urban <christian.urban@kcl.ac.uk>
Fri, 05 Sep 2025 16:59:48 +0100
changeset 981 14e5ae1fb541
parent 959 64ec1884d860
permissions -rw-r--r--
updated

// A simple lexer inspired by work of Sulzmann & Lu
//==================================================
//
// call with
//
//   amm lex.sc 
//


// regular expressions including recods 
// for extracting strings or tokens
enum Rexp { 
  case ZERO 
  case ONE 
  case CHAR(c: Char)
  case ALT(r1: Rexp, r2: Rexp)
  case SEQ(r1: Rexp, r2: Rexp)
  case STAR(r: Rexp) 
  case RECD[A](label: A, r: Rexp)
}
import Rexp._  

// values  
enum Val {
  case Empty 
  case Chr(c: Char) 
  case Sequ(v1: Val, v2: Val) 
  case Left(v: Val) 
  case Right(v: Val) 
  case Stars(vs: List[Val]) 
  case Rec[A](label: A, v: Val)
}   
import Val._

// some convenience for typing in regular expressions
import scala.language.implicitConversions

def charlist2rexp(s : List[Char]): Rexp = s match {
  case Nil => ONE
  case c::Nil => CHAR(c)
  case c::s => SEQ(CHAR(c), charlist2rexp(s))
}

given Conversion[String, Rexp] = (s => charlist2rexp(s.toList))

val HELLO : Rexp = "hello"

extension (r: Rexp) {
  def | (s: Rexp) = ALT(r, s)
  def % = STAR(r)
  def ~ (s: Rexp) = SEQ(r, s)
}

// to use & for records, instead of $ which had
// its precedence be changed in Scala 3

val TEST = ("ab" | "ba").%

def nullable(r: Rexp) : Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
  case RECD(_, r1) => nullable(r1)
}

def der(c: Char, r: Rexp) : Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) => 
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r) => SEQ(der(c, r), STAR(r))
  case RECD(_, r1) => der(c, r1)
}


// extracts a string from a value
def flatten(v: Val) : String = v match {
  case Empty => ""
  case Chr(c) => c.toString
  case Left(v) => flatten(v)
  case Right(v) => flatten(v)
  case Sequ(v1, v2) => flatten(v1) ++ flatten(v2)
  case Stars(vs) => vs.map(flatten).mkString
  case Rec(_, v) => flatten(v)
}


// extracts an environment from a value;
// used for tokenising a string
//import scala.reflect.ClassTag

def env[A](v: Val) : List[(A, String)] = v match {
  case Empty => Nil
  case Chr(c) => Nil
  case Left(v) => env(v)
  case Right(v) => env(v)
  case Sequ(v1, v2) => env(v1) ::: env(v2)
  case Stars(vs) => vs.flatMap(env)
  case Rec[A](x, v) => (x, flatten(v))::env(v)  
}


// The injection and mkeps part of the lexer
//===========================================

// the pattern-matches are defined to be @unchecked
// because they do not need to be defined for
// all cases

def mkeps(r: Rexp) : Val = (r: @unchecked) match {
  case ONE => Empty
  case ALT(r1, r2) => 
    if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
  case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
  case STAR(r) => Stars(Nil)
  case RECD(x, r) => Rec(x, mkeps(r))
}

def inj(r: Rexp, c: Char, v: Val) : Val = ((r, v) : @unchecked) match {
  case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
  case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
  case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))
  case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))
  case (CHAR(d), Empty) => Chr(c) 
  case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))
}

// lexing functions without simplification
def lex(r: Rexp, s: List[Char]) : Val = s match {
  case Nil => if (nullable(r)) mkeps(r) else 
    { throw new Exception("lexing error") } 
  case c::cs => inj(r, c, lex(der(c, r), cs))
}



println(lex(("ab" | "a") ~ (ONE | "b"), "ab".toList))

println(lex(STAR("aa" | "a"), "aaa".toList))

println(lex(STAR(STAR("a")), "aaa".toList))

val re = ("a" | "ab") ~ ("c" | "bc")

println(pders1("abc", re).toList.mkString("\n"))
pders('a', pder('a', re))))
draw(simp(der('a', der('a', der('a', re)))))

size(simp(ders(, re)))
size(simp(der('a', der('a', re))))
size(simp(der('a', der('a', der('a', re)))))


lex(re, "aaaaa".toList)

// The Lexing Rules for the WHILE Language

def PLUS(r: Rexp) = r ~ r.%

def Range(s : List[Char]) : Rexp = s match {
  case Nil => ZERO
  case c::Nil => CHAR(c)
  case c::s => ALT(CHAR(c), Range(s))
}
def RANGE(s: String) = Range(s.toList)

val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_")
val DIGIT = RANGE("0123456789")
val ID = SYM ~ (SYM | DIGIT).% 
val NUM = PLUS(DIGIT)
val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write" 
val SEMI: Rexp = ";"
val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">"
val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r")
val RPAREN: Rexp = "}"
val LPAREN: Rexp = "{"
val STRING: Rexp = "\"" ~ SYM.% ~ "\""


enum TAGS {
  case Key, Id, Op, Num, Semi, Str, Paren, Wht
}
import TAGS._

extension (t: TAGS) {
  def & (r: Rexp) = RECD[TAGS](t, r)
}

def lexing(r: Rexp, s: String) = 
  env[TAGS](lex(r, s.toList))

val WHILE_REGS = ((Key & KEYWORD) | 
                  (Id & ID) | 
                  (Op & OP) | 
                  (Num & NUM) | 
                  (Semi & SEMI) | 
                  (Str & STRING) |
                  (Paren & (LPAREN | RPAREN)) | 
                  (Wht & WHITESPACE)).%


// Two Simple While Tests
//========================
 
@main
def small() = {

  val prog0 = """if"""
  println(s"test: $prog0")
  println(lexing(WHILE_REGS, prog0))

  val prog1 = """iffoo"""
  println(s"test: $prog1")
  println(lexing(WHILE_REGS, prog1))

  val prog2 = """read  n; write n"""  
  println(s"test: $prog2")
  println(lexing(WHILE_REGS, prog2))
}