diff -r b51d34113d47 -r 73f990bc6843 Brexp.scala --- a/Brexp.scala Thu May 07 11:36:15 2020 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,185 +0,0 @@ -import RexpRelated._ -import RexpRelated.Rexp._ -import Spiral._ -import scala.collection.mutable.ArrayBuffer -abstract class BRexp -case object BZERO extends BRexp -case object BONE extends BRexp -case class BCHAR(c: Char) extends BRexp -case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp -case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp -case class BSTAR(r: BRexp) extends BRexp - -object BRexp{ - def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there - //are no enclosing STAR or SEQ - case ZERO => BZERO - case ONE => BONE - case CHAR(c) => BCHAR(c) - case ALTS(rs) => BALTS(Z, rs.map(brternalise)) - case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) ) - case STAR(r) => BSTAR(brternalise(r)) - case RECD(x, r) => brternalise(r) - } - def brnullable (r: BRexp) : Boolean = r match { - case BZERO => false - case BONE => true - case BCHAR(_) => false - case BALTS(bs, rs) => rs.exists(brnullable) - case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2) - case BSTAR(_) => true - } - //this function tells bspill when converting a brexp into a list of rexps - //the conversion : (a+b)*c -> {a*c, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq) - //or is from the original regex but have been "touched" (i.e. have been derived) - //Why make this distinction? Look at the following example: - //r = (c+cb)+c(a+b) - // r\c = (1+b)+(a+b) - //after simplification - // (r\c)simp= 1+b+a - //we lost the structure that tells us 1+b should be grouped together and a grouped as itself - //however in our brexp simplification, - //(r\c)br_simp = 1+b+(a+b) - //we do not allow the bracket to be opened as it is from the original expression and have not been touched - def brder(c: Char, r: BRexp) : BRexp = r match { - case BZERO => BZERO - case BONE => BZERO - case BCHAR(d) => if (c == d) BONE else BZERO - case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD - case BSEQ(r1, r2) => - if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit - else BSEQ(brder(c, r1), r2) - case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r)) - } - def bflat(rs: List[BRexp]): List[BRexp] = { - rs match { - case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier. - case BZERO :: rs1 => bflat(rs1) - case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2) - case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2) - case r1 :: rs2 => r1 :: bflat(rs2) - } - } - def stflat(rs: List[BRexp]): List[BRexp] = { - //println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length) - //assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil May add early termination later. - rs match { - case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier. - case BZERO :: rs1 => bflat(rs1) - case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2) - case r1 :: rs2 => r1 :: bflat(rs2) - } - } - def berase(r:BRexp): Rexp = r match{ - case BZERO => ZERO - case BONE => ONE - case BCHAR(f) => CHAR(f) - case BALTS(bs, rs) => ALTS(rs.map(berase(_))) - case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2)) - case BSTAR(r)=> STAR(berase(r)) - } - def br_simp(r: BRexp): BRexp = r match { - case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match { - case (BZERO, _) => BZERO - case (_, BZERO) => BZERO - case (BONE, r2s) => r2s - case (r1s, r2s) => BSEQ(r1s, r2s) - } - case BALTS(bs, rs) => { - //assert(bs.length == rs.length - 1) - val dist_res = if(bs == S) {//all S - val rs_simp = rs.map(br_simp) - val flat_res = bflat(rs_simp) - distinctBy(flat_res, berase) - } - else{//not allowed to simplify (all Z) - rs - } - dist_res match { - case Nil => {BZERO} - case s :: Nil => { s} - case rs => {BALTS(bs, rs)} - } - } - //case BSTAR(r) => BSTAR(r) - case r => r - } - - def strong_br_simp(r: BRexp): BRexp = r match { - case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match { - case (BZERO, _) => BZERO - case (_, BZERO) => BZERO - case (BONE, r2s) => r2s - case (r1s, r2s) => BSEQ(r1s, r2s) - } - case BALTS(bs, rs) => { - //assert(bs.length == rs.length - 1) - val dist_res = {//all S - val rs_simp = rs.map(strong_br_simp) - val flat_res = stflat(rs_simp) - distinctBy(flat_res, berase) - } - dist_res match { - case Nil => {BZERO} - case s :: Nil => { s} - case rs => {BALTS(bs, rs)} - } - } - case BSTAR(r) => BSTAR(strong_br_simp(r)) - case r => r - } - //we want to bound the size by a function bspill s.t. - //bspill(ders_simp(r,s)) ⊂ PD(r) - //and bspill is size-preserving (up to a constant factor) - //so we bound |ders_simp(r,s)| by |PD(r)| - //where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper - - //the function bspill converts a brexp into a list of rexps - //the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn} - //but this is not always allowed - //sometimes, we just leave the expression as it is: - //eg1: (a+b)*c -> {(a+b)*c} - //eg2: r1+r2 -> {r1+r2} instead of {r1, r2} - //why? - //if we always return {a, b} when we encounter a+b, the property - //bspill(ders_simp(r,s)) ⊂ PD(r) - //does not always hold - //for instance - //if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli" - //then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a} - //However the letter a does not belong to PD( (c+cb)+c(a+b) ) - //then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have - //In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set - //Why a does not belong to PD((c+cb)+c(a+b))? - //PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b)) - //So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b) - //we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r) - //so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b) - //RHS = {1} union pder(c, c(a+b)) - //Observe that pder(c, c(a+b)) == {a+b} - //a and b are together, from the original regular expression (c+cb)+c(a+b). - //but by our simplification, we first flattened this a+b into the same level with 1+b, then - //removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b - //But in PD(r) we only have a+b, no a - //one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂ PD(r) union {all letters} - //But this does not hold either according to experiment. - //So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression - - - - def bspill(r: BRexp): List[Rexp] = { - r match { - case BALTS(b, rs) => { - if(b == S) - rs.flatMap(bspill) - else - List(ALTS(rs.map(berase))) - } - case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) ) - case BZERO => List() - case r => List(berase(r)) - } - - } - -} \ No newline at end of file