progs/scala/re2.scala
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Fri, 18 Dec 2015 10:11:01 +0000
changeset 84 f89372781a4c
parent 34 33065bde3bbd
child 156 6a43ea9305ba
permissions -rw-r--r--
the algorithm is correct according to the Type Inference definition

import scala.language.implicitConversions    
import scala.language.reflectiveCalls
import scala.annotation.tailrec   

abstract class Rexp 
case object NULL extends Rexp
case object EMPTY extends Rexp
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp 
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp 
case class STAR(r: Rexp) extends Rexp 
case class RECD(x: String, r: Rexp) extends Rexp

abstract class Val
case object Void extends Val
case class Chr(c: Char) extends Val
case class Sequ(v1: Val, v2: Val) extends Val
case class Left(v: Val) extends Val
case class Right(v: Val) extends Val
case class Stars(vs: List[Val]) extends Val
case class Rec(x: String, v: Val) extends Val
   
// some convenience for typing in regular expressions
def charlist2rexp(s : List[Char]): Rexp = s match {
  case Nil => EMPTY
  case c::Nil => CHAR(c)
  case c::s => SEQ(CHAR(c), charlist2rexp(s))
}
implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)

implicit def RexpOps(r: Rexp) = new {
  def | (s: Rexp) = ALT(r, s)
  def % = STAR(r)
  def ~ (s: Rexp) = SEQ(r, s)
}

implicit def stringOps(s: String) = new {
  def | (r: Rexp) = ALT(s, r)
  def | (r: String) = ALT(s, r)
  def % = STAR(s)
  def ~ (r: Rexp) = SEQ(s, r)
  def ~ (r: String) = SEQ(s, r)
  def $ (r: Rexp) = RECD(s, r)
}

// size of a regular expressions - for testing purposes 
def size(r: Rexp) : Int = r match {
  case NULL => 1
  case EMPTY => 1
  case CHAR(_) => 1
  case ALT(r1, r2) => 1 + size(r1) + size(r2)
  case SEQ(r1, r2) => 1 + size(r1) + size(r2)
  case STAR(r) => 1 + size(r)
  case RECD(_, r) => 1 + size(r)
}


// nullable function: tests whether the regular 
// expression can recognise the empty string
def nullable (r: Rexp) : Boolean = r match {
  case NULL => false
  case EMPTY => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
  case RECD(_, r1) => nullable(r1)
}

// derivative of a regular expression w.r.t. a character
def der (c: Char, r: Rexp) : Rexp = r match {
  case NULL => NULL
  case EMPTY => NULL
  case CHAR(d) => if (c == d) EMPTY else NULL
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) => 
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r) => SEQ(der(c, r), STAR(r))
  case RECD(_, r1) => der(c, r1)
}

// derivative w.r.t. a string (iterates der)
def ders (s: List[Char], r: Rexp) : Rexp = s match {
  case Nil => r
  case c::s => ders(s, der(c, r))
}

// extracts a string from value
def flatten(v: Val) : String = v match {
  case Void => ""
  case Chr(c) => c.toString
  case Left(v) => flatten(v)
  case Right(v) => flatten(v)
  case Sequ(v1, v2) => flatten(v1) + flatten(v2)
  case Stars(vs) => vs.map(flatten).mkString
  case Rec(_, v) => flatten(v)
}

// extracts an environment from a value
def env(v: Val) : List[(String, String)] = v match {
  case Void => Nil
  case Chr(c) => Nil
  case Left(v) => env(v)
  case Right(v) => env(v)
  case Sequ(v1, v2) => env(v1) ::: env(v2)
  case Stars(vs) => vs.flatMap(env)
  case Rec(x, v) => (x, flatten(v))::env(v)
}

def mkeps_all(r: Rexp) : Set[Val] = r match {
  case EMPTY => Set(Void)
  case ALT(r1, r2) => (nullable(r1), nullable(r2)) match {
    case (true, true) => mkeps_all(r1).map(Left) ++ mkeps_all(r2).map(Right)
    case (true, false) => mkeps_all(r1).map(Left)
    case (false, true) => mkeps_all(r2).map(Right)
  }
  case SEQ(r1, r2) => for (v1 <- mkeps_all(r1);
                           v2 <- mkeps_all(r2)) yield Sequ(v1, v2)
  case STAR(r) => Set(Stars(Nil), Stars(List(mkeps(r))))
  case RECD(x, r) => for (v <-  mkeps_all(r)) yield Rec(x, v)
}

def inj(r: Rexp, c: Char, v: Val) : Val = (r, v) match {
  case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
  case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
  case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))
  case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))
  case (CHAR(d), Void) => Chr(c) 
  case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))
}

def inj_all(r: Rexp, c: Char, vs: Set[Val]) : Set[Val] =
  for (v <- vs) yield inj(r, c, v)

// main lexing function (produces a value)
def lex(r: Rexp, s: List[Char]) : Val = s match {
  case Nil => if (nullable(r)) mkeps(r) else throw new Exception("Not matched")
  case c::cs => inj(r, c, lex(der(c, r), cs))
}

def lexing(r: Rexp, s: String) : Val = lex(r, s.toList)

// Examples
val K: Rexp = "a" | "b"
val I: Rexp = "ab" | "ba"  

val R0 = (K | I).%

lexing(R0, "abab")

val K: Rexp = ("key" $ "a" | "b")
val I: Rexp = ("id" $ ("ab" | "ba"))  

val R0 = (K | I).%
lexing(R0, "abaa")
env(lexing(R0, "abaa"))

val r0: Rexp = (K | I).%
val r1 = der('a', r0)
val r1_simp = simp2(r1)
val r2 = der('b', r1)
val r2_simp = simp2(r2)
nullable(r2)
val v2 = mkeps(r2)
val v1 = inj(r1, 'b', v2)
val v0 = inj(r0, 'a', v1)
env(v0)
env(lexing(r0, "abab"))