Brexp.scala
author Chengsong
Sat, 16 Mar 2019 20:05:13 +0000
changeset 7 1572760ff866
parent 5 622ddbb1223a
permissions -rwxr-xr-x
found the difference: caused by flats in ders_simp, the alts is at top most level , so no fuse and bits stay at the alts level whereas in ders + singele simp, the alts that should be the final top-level alts is not at the topmost level initially before simplification so it is opened up and bits fused. later it finds out itself the top level only aalts remaining, but the fuse is not reversible we do not know what happened either.

import RexpRelated._
import RexpRelated.Rexp._
import Spiral._
import scala.collection.mutable.ArrayBuffer
abstract class BRexp
case object BZERO extends BRexp
case object BONE extends BRexp
case class BCHAR(c: Char) extends BRexp
case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp 
case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp 
case class BSTAR(r: BRexp) extends BRexp 

object BRexp{
  def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there 
  //are no enclosing STAR or SEQ
    case ZERO => BZERO
    case ONE => BONE
    case CHAR(c) => BCHAR(c)
    case ALTS(rs) => BALTS(Z,  rs.map(brternalise))
    case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) )
    case STAR(r) => BSTAR(brternalise(r))
    case RECD(x, r) => brternalise(r)
  }
  def brnullable (r: BRexp) : Boolean = r match {
    case BZERO => false
    case BONE => true
    case BCHAR(_) => false
    case BALTS(bs, rs) => rs.exists(brnullable)
    case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2)
    case BSTAR(_) => true
  }
  //this function tells bspill when converting a brexp into a list of rexps
  //the conversion : (a+b)*c -> {a*c, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq)
  //or is from the original regex but have been "touched" (i.e. have been derived)
  //Why make this distinction? Look at the following example:
  //r = (c+cb)+c(a+b)
  // r\c = (1+b)+(a+b)
  //after simplification
  // (r\c)simp= 1+b+a
  //we lost the structure that tells us 1+b should be grouped together and a grouped as itself
  //however in our brexp simplification, 
  //(r\c)br_simp = 1+b+(a+b)
  //we do not allow the bracket to be opened as it is from the original expression and have not been touched
  def brder(c: Char, r: BRexp) : BRexp = r match {
    case BZERO => BZERO
    case BONE => BZERO
    case BCHAR(d) => if (c == d) BONE else BZERO
    case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD
    case BSEQ(r1, r2) => 
      if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit
      else BSEQ(brder(c, r1), r2)
    case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r))
  }
  def bflat(rs: List[BRexp]): List[BRexp] = {
    rs match {
      case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
      case BZERO :: rs1 => bflat(rs1)
      case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2)
      case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2)
      case r1 :: rs2 => r1 :: bflat(rs2)
    }
  }
  def stflat(rs: List[BRexp]): List[BRexp] = {
    //println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length)
    //assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil  May add early termination later.
    rs match {
      case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
      case BZERO :: rs1 => bflat(rs1)
      case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2)
      case r1 :: rs2 => r1 :: bflat(rs2)
    }
  }
  def berase(r:BRexp): Rexp = r match{
    case BZERO => ZERO
    case BONE => ONE
    case BCHAR(f) => CHAR(f)
    case BALTS(bs, rs) => ALTS(rs.map(berase(_)))
    case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2))
    case BSTAR(r)=> STAR(berase(r))
  }
  def br_simp(r: BRexp): BRexp = r match {
    case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match {
        case (BZERO, _) => BZERO
        case (_, BZERO) => BZERO
        case (BONE, r2s) => r2s 
        case (r1s, r2s) => BSEQ(r1s, r2s)
    }
    case BALTS(bs, rs) => {
      //assert(bs.length == rs.length - 1)
      val dist_res = if(bs == S) {//all S
        val rs_simp = rs.map(br_simp)
        val flat_res = bflat(rs_simp)
        distinctBy(flat_res, berase)
      }
      else{//not allowed to simplify (all Z)
        rs
      }
      dist_res match {
        case Nil => {BZERO}
        case s :: Nil => { s}
        case rs => {BALTS(bs, rs)}
      }
    }
    //case BSTAR(r) => BSTAR(r)
    case r => r
  }

  def strong_br_simp(r: BRexp): BRexp = r match {
    case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match {
        case (BZERO, _) => BZERO
        case (_, BZERO) => BZERO
        case (BONE, r2s) => r2s 
        case (r1s, r2s) => BSEQ(r1s, r2s)
    }
    case BALTS(bs, rs) => {
      //assert(bs.length == rs.length - 1)
      val dist_res = {//all S
        val rs_simp = rs.map(strong_br_simp)
        val flat_res = stflat(rs_simp)
        distinctBy(flat_res, berase)
      }
      dist_res match {
        case Nil => {BZERO}
        case s :: Nil => { s}
        case rs => {BALTS(bs, rs)}
      }
    }
    case BSTAR(r) => BSTAR(strong_br_simp(r))
    case r => r
  }
  //we want to bound the size by a function bspill s.t.
  //bspill(ders_simp(r,s)) ⊂  PD(r) 
  //and bspill is size-preserving (up to a constant factor)
  //so we bound |ders_simp(r,s)| by |PD(r)|
  //where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper

  //the function bspill converts a brexp into a list of rexps
  //the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn} 
  //but this is not always allowed
  //sometimes, we just leave the expression as it is: 
  //eg1: (a+b)*c -> {(a+b)*c}
  //eg2: r1+r2 -> {r1+r2} instead of {r1, r2}
  //why?
  //if we always return {a, b} when we encounter a+b, the property
  //bspill(ders_simp(r,s)) ⊂  PD(r) 
  //does not always hold
  //for instance 
  //if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli"
  //then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a}
  //However the letter a does not belong to PD( (c+cb)+c(a+b) )
  //then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have
  //In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set
  //Why a does not belong to PD((c+cb)+c(a+b))?
  //PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b))
  //So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b)
  //we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r)
  //so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b)
  //RHS = {1} union pder(c, c(a+b))
  //Observe that pder(c, c(a+b)) == {a+b}
  //a and b are together, from the original regular expression (c+cb)+c(a+b).
  //but by our simplification, we first flattened this a+b into the same level with 1+b, then
  //removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b
  //But in PD(r) we only have a+b, no a
  //one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂  PD(r) union {all letters}
  //But this does not hold either according to experiment.
  //So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression
  

  
  def bspill(r: BRexp): List[Rexp] = {
      r match {
          case BALTS(b, rs) => {
            if(b == S)
              rs.flatMap(bspill)
            else
              List(ALTS(rs.map(berase)))
          }
          case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) )
          case BZERO => List()
          case r => List(berase(r))
      }
    
  }

}