found the difference: caused by flats
in ders_simp, the alts is at top most level , so no fuse and bits stay at the alts level
whereas in ders + singele simp, the alts that should be the final top-level alts is not at the topmost level initially before simplification
so it is opened up and bits fused. later it finds out itself the top level only aalts remaining, but the fuse is not reversible
we do not know what happened either.
import RexpRelated._
import RexpRelated.Rexp._
import Spiral._
import scala.collection.mutable.ArrayBuffer
abstract class BRexp
case object BZERO extends BRexp
case object BONE extends BRexp
case class BCHAR(c: Char) extends BRexp
case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp
case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp
case class BSTAR(r: BRexp) extends BRexp
object BRexp{
def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there
//are no enclosing STAR or SEQ
case ZERO => BZERO
case ONE => BONE
case CHAR(c) => BCHAR(c)
case ALTS(rs) => BALTS(Z, rs.map(brternalise))
case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) )
case STAR(r) => BSTAR(brternalise(r))
case RECD(x, r) => brternalise(r)
}
def brnullable (r: BRexp) : Boolean = r match {
case BZERO => false
case BONE => true
case BCHAR(_) => false
case BALTS(bs, rs) => rs.exists(brnullable)
case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2)
case BSTAR(_) => true
}
//this function tells bspill when converting a brexp into a list of rexps
//the conversion : (a+b)*c -> {a*c, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq)
//or is from the original regex but have been "touched" (i.e. have been derived)
//Why make this distinction? Look at the following example:
//r = (c+cb)+c(a+b)
// r\c = (1+b)+(a+b)
//after simplification
// (r\c)simp= 1+b+a
//we lost the structure that tells us 1+b should be grouped together and a grouped as itself
//however in our brexp simplification,
//(r\c)br_simp = 1+b+(a+b)
//we do not allow the bracket to be opened as it is from the original expression and have not been touched
def brder(c: Char, r: BRexp) : BRexp = r match {
case BZERO => BZERO
case BONE => BZERO
case BCHAR(d) => if (c == d) BONE else BZERO
case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD
case BSEQ(r1, r2) =>
if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit
else BSEQ(brder(c, r1), r2)
case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r))
}
def bflat(rs: List[BRexp]): List[BRexp] = {
rs match {
case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
case BZERO :: rs1 => bflat(rs1)
case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2)
case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2)
case r1 :: rs2 => r1 :: bflat(rs2)
}
}
def stflat(rs: List[BRexp]): List[BRexp] = {
//println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length)
//assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil May add early termination later.
rs match {
case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
case BZERO :: rs1 => bflat(rs1)
case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2)
case r1 :: rs2 => r1 :: bflat(rs2)
}
}
def berase(r:BRexp): Rexp = r match{
case BZERO => ZERO
case BONE => ONE
case BCHAR(f) => CHAR(f)
case BALTS(bs, rs) => ALTS(rs.map(berase(_)))
case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2))
case BSTAR(r)=> STAR(berase(r))
}
def br_simp(r: BRexp): BRexp = r match {
case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match {
case (BZERO, _) => BZERO
case (_, BZERO) => BZERO
case (BONE, r2s) => r2s
case (r1s, r2s) => BSEQ(r1s, r2s)
}
case BALTS(bs, rs) => {
//assert(bs.length == rs.length - 1)
val dist_res = if(bs == S) {//all S
val rs_simp = rs.map(br_simp)
val flat_res = bflat(rs_simp)
distinctBy(flat_res, berase)
}
else{//not allowed to simplify (all Z)
rs
}
dist_res match {
case Nil => {BZERO}
case s :: Nil => { s}
case rs => {BALTS(bs, rs)}
}
}
//case BSTAR(r) => BSTAR(r)
case r => r
}
def strong_br_simp(r: BRexp): BRexp = r match {
case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match {
case (BZERO, _) => BZERO
case (_, BZERO) => BZERO
case (BONE, r2s) => r2s
case (r1s, r2s) => BSEQ(r1s, r2s)
}
case BALTS(bs, rs) => {
//assert(bs.length == rs.length - 1)
val dist_res = {//all S
val rs_simp = rs.map(strong_br_simp)
val flat_res = stflat(rs_simp)
distinctBy(flat_res, berase)
}
dist_res match {
case Nil => {BZERO}
case s :: Nil => { s}
case rs => {BALTS(bs, rs)}
}
}
case BSTAR(r) => BSTAR(strong_br_simp(r))
case r => r
}
//we want to bound the size by a function bspill s.t.
//bspill(ders_simp(r,s)) ⊂ PD(r)
//and bspill is size-preserving (up to a constant factor)
//so we bound |ders_simp(r,s)| by |PD(r)|
//where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper
//the function bspill converts a brexp into a list of rexps
//the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn}
//but this is not always allowed
//sometimes, we just leave the expression as it is:
//eg1: (a+b)*c -> {(a+b)*c}
//eg2: r1+r2 -> {r1+r2} instead of {r1, r2}
//why?
//if we always return {a, b} when we encounter a+b, the property
//bspill(ders_simp(r,s)) ⊂ PD(r)
//does not always hold
//for instance
//if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli"
//then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a}
//However the letter a does not belong to PD( (c+cb)+c(a+b) )
//then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have
//In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set
//Why a does not belong to PD((c+cb)+c(a+b))?
//PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b))
//So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b)
//we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r)
//so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b)
//RHS = {1} union pder(c, c(a+b))
//Observe that pder(c, c(a+b)) == {a+b}
//a and b are together, from the original regular expression (c+cb)+c(a+b).
//but by our simplification, we first flattened this a+b into the same level with 1+b, then
//removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b
//But in PD(r) we only have a+b, no a
//one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂ PD(r) union {all letters}
//But this does not hold either according to experiment.
//So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression
def bspill(r: BRexp): List[Rexp] = {
r match {
case BALTS(b, rs) => {
if(b == S)
rs.flatMap(bspill)
else
List(ALTS(rs.map(berase)))
}
case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) )
case BZERO => List()
case r => List(berase(r))
}
}
}