diff -r 99f4459d9bb6 -r cf169411b771 Brexp.scala --- a/Brexp.scala Wed Mar 13 13:33:54 2019 +0000 +++ b/Brexp.scala Wed Mar 13 15:27:09 2019 +0000 @@ -52,8 +52,6 @@ case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r)) } def bflat(rs: List[BRexp]): List[BRexp] = { - //println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length) - //assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil May add early termination later. rs match { case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier. case BZERO :: rs1 => bflat(rs1) @@ -130,31 +128,52 @@ //case BSTAR(r) => BSTAR(r) case r => r } + //we want to bound the size by a function bspill s.t. + //bspill(ders_simp(r,s)) ⊂ PD(r) + //and bspill is size-preserving (up to a constant factor) + //so we bound |ders_simp(r,s)| by |PD(r)| + //where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper - def break_chain(bs: Bit, rs: List[BRexp]): List[Rexp] = { - //do it in a functional style - if(bs == S) - rs.flatMap(bspill) - else - List(ALTS(rs.map(berase))) - } + //the function bspill converts a brexp into a list of rexps + //the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn} + //but this is not always allowed + //sometimes, we just leave the expression as it is: + //eg1: (a+b)*c -> {(a+b)*c} + //eg2: r1+r2 -> {r1+r2} instead of {r1, r2} + //why? + //if we always return {a, b} when we encounter a+b, the property + //bspill(ders_simp(r,s)) ⊂ PD(r) + //does not always hold + //for instance + //if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli" + //then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a} + //However the letter a does not belong to PD( (c+cb)+c(a+b) ) + //then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have + //In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set + //Why a does not belong to PD((c+cb)+c(a+b))? + //PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b)) + //So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b) + //we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r) + //so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b) + //RHS = {1} union pder(c, c(a+b)) + //Observe that pder(c, c(a+b)) == {a+b} + //a and b are together, from the original regular expression (c+cb)+c(a+b). + //but by our simplification, we first flattened this a+b into the same level with 1+b, then + //removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b + //But in PD(r) we only have a+b, no a + //one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂ PD(r) union {all letters} + //But this does not hold either according to experiment. + //So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression - //this function converts a brexp into a list of rexps - //the conversion mainly about this: (a+b)*c -> {a*c, b*c} if a+b is generated from scratch (e.g. when deriving a seq) - //or is from the original regex but have been "touched" (i.e. have been derived) - //Why make this distinction? Look at the following example: - //r = (c+cb)+c(a+b) - // r\c = (1+b)+(a+b) - //after simplification - // (r\c)simp= 1+b+a - //we lost the structure that tells us 1+b should be grouped together and a grouped as itself - //however in our brexp simplification, - //(r\c)br_simp = 1+b+(a+b) - //we do not allow the bracket to be opened as it is from the original expression and have not been touched + + def bspill(r: BRexp): List[Rexp] = { r match { - case BALTS(bs, rs) => { - break_chain(bs, rs) + case BALTS(b, rs) => { + if(b == S) + rs.flatMap(bspill) + else + List(ALTS(rs.map(berase))) } case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) ) case BZERO => List()