Brexp.scala
author Chengsong
Fri, 15 Mar 2019 12:27:12 +0000
changeset 4 7a349fe58bf4
parent 2 cf169411b771
child 5 622ddbb1223a
permissions -rwxr-xr-x
:test whether bsimp(bders(r, s)) == ders_simp(r,s) This does not hold. As illustrated by the output of the example where string is abaa and regex is (a* + ab)* this property still holds when s = aba but after the derivative of the last character a, the result is negative. However this seems easily fixable as the difference is relatively small, it might be possible to find where the difference happens and by taking that difference into accound we have some function f s.t. f(bsimp(r,s)) = ders_simp(r,s) and mkeps(f(r)) = mkeps(r) This will complete the correctness proof. Even if finding such f is hard, it would still provide insights for the real difference between simp(der(simp(der(simp(der...... and simp(ders

import RexpRelated._
import RexpRelated.Rexp._
import Spiral._
import scala.collection.mutable.ArrayBuffer
abstract class BRexp
case object BZERO extends BRexp
case object BONE extends BRexp
case class BCHAR(c: Char) extends BRexp
case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp 
case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp 
case class BSTAR(r: BRexp) extends BRexp 

object BRexp{
  def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there 
  //are no enclosing STAR or SEQ
    case ZERO => BZERO
    case ONE => BONE
    case CHAR(c) => BCHAR(c)
    case ALTS(rs) => BALTS(Z,  rs.map(brternalise))
    case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) )
    case STAR(r) => BSTAR(brternalise(r))
    case RECD(x, r) => brternalise(r)
  }
  def brnullable (r: BRexp) : Boolean = r match {
    case BZERO => false
    case BONE => true
    case BCHAR(_) => false
    case BALTS(bs, rs) => rs.exists(brnullable)
    case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2)
    case BSTAR(_) => true
  }
  //this function tells bspill when converting a brexp into a list of rexps
  //the conversion : (a+b)*c -> {a*c, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq)
  //or is from the original regex but have been "touched" (i.e. have been derived)
  //Why make this distinction? Look at the following example:
  //r = (c+cb)+c(a+b)
  // r\c = (1+b)+(a+b)
  //after simplification
  // (r\c)simp= 1+b+a
  //we lost the structure that tells us 1+b should be grouped together and a grouped as itself
  //however in our brexp simplification, 
  //(r\c)br_simp = 1+b+(a+b)
  //we do not allow the bracket to be opened as it is from the original expression and have not been touched
  def brder(c: Char, r: BRexp) : BRexp = r match {
    case BZERO => BZERO
    case BONE => BZERO
    case BCHAR(d) => if (c == d) BONE else BZERO
    case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD
    case BSEQ(r1, r2) => 
      if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit
      else BSEQ(brder(c, r1), r2)
    case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r))
  }
  def bflat(rs: List[BRexp]): List[BRexp] = {
    rs match {
      case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
      case BZERO :: rs1 => bflat(rs1)
      case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2)
      case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2)
      case r1 :: rs2 => r1 :: bflat(rs2)
    }
  }
  def stflat(rs: List[BRexp]): List[BRexp] = {
    //println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length)
    //assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil  May add early termination later.
    rs match {
      case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
      case BZERO :: rs1 => bflat(rs1)
      case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2)
      case r1 :: rs2 => r1 :: bflat(rs2)
    }
  }
  def berase(r:BRexp): Rexp = r match{
    case BZERO => ZERO
    case BONE => ONE
    case BCHAR(f) => CHAR(f)
    case BALTS(bs, rs) => ALTS(rs.map(berase(_)))
    case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2))
    case BSTAR(r)=> STAR(berase(r))
  }
  def br_simp(r: BRexp): BRexp = r match {
    case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match {
        case (BZERO, _) => BZERO
        case (_, BZERO) => BZERO
        case (BONE, r2s) => r2s 
        case (r1s, r2s) => BSEQ(r1s, r2s)
    }
    case BALTS(bs, rs) => {
      //assert(bs.length == rs.length - 1)
      val dist_res = if(bs == S) {//all S
        val rs_simp = rs.map(br_simp)
        val flat_res = bflat(rs_simp)
        distinctBy(flat_res, berase)
      }
      else{//not allowed to simplify (all Z)
        rs
      }
      dist_res match {
        case Nil => {BZERO}
        case s :: Nil => { s}
        case rs => {BALTS(bs, rs)}
      }
    }
    //case BSTAR(r) => BSTAR(r)
    case r => r
  }

  def strong_br_simp(r: BRexp): BRexp = r match {
    case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match {
        case (BZERO, _) => BZERO
        case (_, BZERO) => BZERO
        case (BONE, r2s) => r2s 
        case (r1s, r2s) => BSEQ(r1s, r2s)
    }
    case BALTS(bs, rs) => {
      //assert(bs.length == rs.length - 1)
      val dist_res = {//all S
        val rs_simp = rs.map(strong_br_simp)
        val flat_res = stflat(rs_simp)
        distinctBy(flat_res, berase)
      }
      dist_res match {
        case Nil => {BZERO}
        case s :: Nil => { s}
        case rs => {BALTS(bs, rs)}
      }
    }
    //case BSTAR(r) => BSTAR(r)
    case r => r
  }
  //we want to bound the size by a function bspill s.t.
  //bspill(ders_simp(r,s)) ⊂  PD(r) 
  //and bspill is size-preserving (up to a constant factor)
  //so we bound |ders_simp(r,s)| by |PD(r)|
  //where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper

  //the function bspill converts a brexp into a list of rexps
  //the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn} 
  //but this is not always allowed
  //sometimes, we just leave the expression as it is: 
  //eg1: (a+b)*c -> {(a+b)*c}
  //eg2: r1+r2 -> {r1+r2} instead of {r1, r2}
  //why?
  //if we always return {a, b} when we encounter a+b, the property
  //bspill(ders_simp(r,s)) ⊂  PD(r) 
  //does not always hold
  //for instance 
  //if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli"
  //then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a}
  //However the letter a does not belong to PD( (c+cb)+c(a+b) )
  //then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have
  //In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set
  //Why a does not belong to PD((c+cb)+c(a+b))?
  //PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b))
  //So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b)
  //we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r)
  //so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b)
  //RHS = {1} union pder(c, c(a+b))
  //Observe that pder(c, c(a+b)) == {a+b}
  //a and b are together, from the original regular expression (c+cb)+c(a+b).
  //but by our simplification, we first flattened this a+b into the same level with 1+b, then
  //removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b
  //But in PD(r) we only have a+b, no a
  //one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂  PD(r) union {all letters}
  //But this does not hold either according to experiment.
  //So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression
  

  
  def bspill(r: BRexp): List[Rexp] = {
      r match {
          case BALTS(b, rs) => {
            if(b == S)
              rs.flatMap(bspill)
            else
              List(ALTS(rs.map(berase)))
          }
          case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) )
          case BZERO => List()
          case r => List(berase(r))
      }
    
  }

}