progs/matcher/re1.sc
author Christian Urban <christian.urban@kcl.ac.uk>
Fri, 03 Oct 2025 10:10:33 +0100
changeset 998 69eddde11a65
parent 981 14e5ae1fb541
permissions -rw-r--r--
updated

// A simple matcher for basic regular expressions
//
// Call the testcases with X = {1,2,3}
//
//   amm re1.sc testX
//
// or 
//
//   amm re1.sc all
//


// regular expressions (as enum in Scala 3)
enum Rexp {
  case ZERO                     // matches nothing
  case ONE                      // matches an empty string
  case CHAR(c: Char)            // matches a character c
  case ALT(r1: Rexp, r2: Rexp)  // alternative
  case SEQ(r1: Rexp, r2: Rexp)  // sequence
  case STAR(r: Rexp)            // star
}
import Rexp._


/* 
UPDATE: The videos and handouts still us the older syntax 
with classes, which still works but is more verbose

abstract class Rexp
case object ZERO extends Rexp
case object ONE extends Rexp
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp
*/

// nullable function: tests whether a regular 
// expression can recognise the empty string  
def nullable(r: Rexp) : Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// the derivative of a regular expression w.r.t. a character
def der(c: Char, r: Rexp) : Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) => 
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// the derivative w.r.t. a string (iterates der)
def ders(s: List[Char], r: Rexp) : Rexp = s match {
  case Nil => r
  case c::s => ders(s, der(c, r))
}

// the main matcher function
def matcher(r: Rexp, s: String) : Boolean = 
  nullable(ders(s.toList, r))


// some examples from the homework

val r = SEQ(CHAR('a'), CHAR('c'))
matcher(r, "ac")

val r1 = STAR(ALT(SEQ(CHAR('a'), CHAR('b')), CHAR('b')))
der('a', r)
der('b', r)
der('c', r)

val r2 = SEQ(SEQ(CHAR('x'), CHAR('y')), CHAR('z'))
der('x', r2)
der('y', der('x', r2))
der('z', der('y', der('x', r2)))


// Test Cases
//============

// the optional regular expression (one or zero times)
def OPT(r: Rexp) = ALT(r, ONE)   // r + 1

// the n-times regular expression (explicitly expanded to SEQs)
def NTIMES(r: Rexp, n: Int) : Rexp = n match {
  case 0 => ONE
  case 1 => r
  case n => SEQ(r, NTIMES(r, n - 1))
}

// the evil regular expression  (a?){n} a{n}
def EVIL1(n: Int) = 
  SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))

// the evil regular expression (a*)* b
val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))

// for measuring time
def time_needed[T](i: Int, code: => T) = {
  val start = System.nanoTime()
  for (j <- 1 to i) code
  val end = System.nanoTime()
  (end - start) / (i * 1.0e9)
}


// test: (a?{n}) (a{n})
@main
def test1() = {
  println("Test (a?{n}) (a{n})")

  for (i <- 0 to 20 by 2) {
    println(f"$i: ${time_needed(2, matcher(EVIL1(i), "a" * i))}%.5f")
  }
}

// test: (a*)* b
@main
def test2() = {
  println("Test (a*)* b")

  for (i <- 0 to 20 by 2) {
    println(f"$i: ${time_needed(2, matcher(EVIL2, "a" * i))}%.5f")
  }
}




// the size of a regular expressions - for testing purposes 
def size(r: Rexp) : Int = r match {
  case ZERO => 1
  case ONE => 1
  case CHAR(_) => 1
  case ALT(r1, r2) => 1 + size(r1) + size(r2)
  case SEQ(r1, r2) => 1 + size(r1) + size(r2)
  case STAR(r) => 1 + size(r)
}

// the expicit expansion in EVIL1(n) increases
// drastically its size - (a?){n} a{n}

size(EVIL1(1))  // 5
size(EVIL1(3))  // 17
size(EVIL1(5))  // 29
size(EVIL1(7))  // 41
size(EVIL1(20)) // 119

size(ders(("a" * 20).toList, EVIL1(20))) 


// given a regular expression and building successive
// derivatives might result into bigger and bigger
// regular expressions...here is an example for this:


// (a + aa)*
val BIG = STAR(ALT(CHAR('a'), SEQ(CHAR('a'), CHAR('a'))))

size(ders("".toList, BIG))              // 13
size(ders("aa".toList, BIG))            // 51
size(ders("aaaa".toList, BIG))          // 112
size(ders("aaaaaa".toList, BIG))        // 191
size(ders("aaaaaaaa".toList, BIG))      // 288
size(ders("aaaaaaaaaa".toList, BIG))    // 403
size(ders("aaaaaaaaaaaa".toList, BIG))  // 536


size(ders(("a" * 30).toList, BIG))      // 31010539

@main
def test3() = {
  println("Test (a + aa)*")

  for (i <- 0 to 30 by 5) {
    println(f"$i: ${time_needed(2, matcher(BIG, "a" * i))}%.5f")
  }
}



// Some code for pretty printing regexes as trees

def implode(ss: Seq[String]) = ss.mkString("\n")
def explode(s: String) = s.split("\n").toList

def lst(s: String) : String = explode(s) match {
  case hd :: tl => implode(" └" ++ hd :: tl.map("  " ++ _))
  case Nil => ""
}

def mid(s: String) : String = explode(s) match {
  case hd :: tl => implode(" ├" ++ hd :: tl.map(" │" ++ _))
  case Nil => ""
}

def indent(ss: Seq[String]) : String = ss match {
  case init :+ last => implode(init.map(mid) :+ lst(last))
  case _ => ""
}

def pp(e: Rexp) : String = e match {
  case ZERO => "0\n"
  case ONE => "1\n"
  case CHAR(c) => s"$c\n"
  case ALT(r1, r2) => "ALT\n" ++ pps(r1, r2)
  case SEQ(r1, r2) => "SEQ\n" ++ pps(r1, r2)
  case STAR(r) => "STAR\n" ++ pps(r)
}
def pps(es: Rexp*) = indent(es.map(pp))


@main
def test4() = {
  println(pp(r2))
  println(pp(ders("x".toList, r2)))
  println(pp(ders("xy".toList, r2)))
  println(pp(ders("xyz".toList, r2)))
}

@main
def all() = { test1(); test2() ; test3() ; test4() }