0
+ − 1
import RexpRelated._
+ − 2
import RexpRelated.Rexp._
+ − 3
import Spiral._
+ − 4
import scala.collection.mutable.ArrayBuffer
+ − 5
abstract class BRexp
+ − 6
case object BZERO extends BRexp
+ − 7
case object BONE extends BRexp
+ − 8
case class BCHAR(c: Char) extends BRexp
+ − 9
case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp
+ − 10
case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp
+ − 11
case class BSTAR(r: BRexp) extends BRexp
+ − 12
+ − 13
object BRexp{
+ − 14
def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there
+ − 15
//are no enclosing STAR or SEQ
+ − 16
case ZERO => BZERO
+ − 17
case ONE => BONE
+ − 18
case CHAR(c) => BCHAR(c)
+ − 19
case ALTS(rs) => BALTS(Z, rs.map(brternalise))
+ − 20
case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) )
+ − 21
case STAR(r) => BSTAR(brternalise(r))
+ − 22
case RECD(x, r) => brternalise(r)
+ − 23
}
+ − 24
def brnullable (r: BRexp) : Boolean = r match {
+ − 25
case BZERO => false
+ − 26
case BONE => true
+ − 27
case BCHAR(_) => false
+ − 28
case BALTS(bs, rs) => rs.exists(brnullable)
+ − 29
case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2)
+ − 30
case BSTAR(_) => true
+ − 31
}
1
+ − 32
//this function tells bspill when converting a brexp into a list of rexps
+ − 33
//the conversion : (a+b)*c -> {a*c, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq)
+ − 34
//or is from the original regex but have been "touched" (i.e. have been derived)
+ − 35
//Why make this distinction? Look at the following example:
+ − 36
//r = (c+cb)+c(a+b)
+ − 37
// r\c = (1+b)+(a+b)
+ − 38
//after simplification
+ − 39
// (r\c)simp= 1+b+a
+ − 40
//we lost the structure that tells us 1+b should be grouped together and a grouped as itself
+ − 41
//however in our brexp simplification,
+ − 42
//(r\c)br_simp = 1+b+(a+b)
+ − 43
//we do not allow the bracket to be opened as it is from the original expression and have not been touched
0
+ − 44
def brder(c: Char, r: BRexp) : BRexp = r match {
+ − 45
case BZERO => BZERO
+ − 46
case BONE => BZERO
+ − 47
case BCHAR(d) => if (c == d) BONE else BZERO
+ − 48
case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD
+ − 49
case BSEQ(r1, r2) =>
+ − 50
if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit
+ − 51
else BSEQ(brder(c, r1), r2)
+ − 52
case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r))
+ − 53
}
+ − 54
def bflat(rs: List[BRexp]): List[BRexp] = {
+ − 55
rs match {
+ − 56
case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
+ − 57
case BZERO :: rs1 => bflat(rs1)
+ − 58
case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2)
+ − 59
case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2)
+ − 60
case r1 :: rs2 => r1 :: bflat(rs2)
+ − 61
}
+ − 62
}
+ − 63
def stflat(rs: List[BRexp]): List[BRexp] = {
+ − 64
//println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length)
+ − 65
//assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil May add early termination later.
+ − 66
rs match {
+ − 67
case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
+ − 68
case BZERO :: rs1 => bflat(rs1)
+ − 69
case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2)
+ − 70
case r1 :: rs2 => r1 :: bflat(rs2)
+ − 71
}
+ − 72
}
+ − 73
def berase(r:BRexp): Rexp = r match{
+ − 74
case BZERO => ZERO
+ − 75
case BONE => ONE
+ − 76
case BCHAR(f) => CHAR(f)
+ − 77
case BALTS(bs, rs) => ALTS(rs.map(berase(_)))
+ − 78
case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2))
+ − 79
case BSTAR(r)=> STAR(berase(r))
+ − 80
}
+ − 81
def br_simp(r: BRexp): BRexp = r match {
+ − 82
case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match {
+ − 83
case (BZERO, _) => BZERO
+ − 84
case (_, BZERO) => BZERO
+ − 85
case (BONE, r2s) => r2s
+ − 86
case (r1s, r2s) => BSEQ(r1s, r2s)
+ − 87
}
+ − 88
case BALTS(bs, rs) => {
+ − 89
//assert(bs.length == rs.length - 1)
+ − 90
val dist_res = if(bs == S) {//all S
+ − 91
val rs_simp = rs.map(br_simp)
+ − 92
val flat_res = bflat(rs_simp)
+ − 93
distinctBy(flat_res, berase)
+ − 94
}
+ − 95
else{//not allowed to simplify (all Z)
+ − 96
rs
+ − 97
}
+ − 98
dist_res match {
+ − 99
case Nil => {BZERO}
+ − 100
case s :: Nil => { s}
+ − 101
case rs => {BALTS(bs, rs)}
+ − 102
}
+ − 103
}
+ − 104
//case BSTAR(r) => BSTAR(r)
+ − 105
case r => r
+ − 106
}
+ − 107
+ − 108
def strong_br_simp(r: BRexp): BRexp = r match {
+ − 109
case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match {
+ − 110
case (BZERO, _) => BZERO
+ − 111
case (_, BZERO) => BZERO
+ − 112
case (BONE, r2s) => r2s
+ − 113
case (r1s, r2s) => BSEQ(r1s, r2s)
+ − 114
}
+ − 115
case BALTS(bs, rs) => {
+ − 116
//assert(bs.length == rs.length - 1)
+ − 117
val dist_res = {//all S
+ − 118
val rs_simp = rs.map(strong_br_simp)
+ − 119
val flat_res = stflat(rs_simp)
+ − 120
distinctBy(flat_res, berase)
+ − 121
}
+ − 122
dist_res match {
+ − 123
case Nil => {BZERO}
+ − 124
case s :: Nil => { s}
+ − 125
case rs => {BALTS(bs, rs)}
+ − 126
}
+ − 127
}
5
+ − 128
case BSTAR(r) => BSTAR(strong_br_simp(r))
0
+ − 129
case r => r
+ − 130
}
2
+ − 131
//we want to bound the size by a function bspill s.t.
+ − 132
//bspill(ders_simp(r,s)) ⊂ PD(r)
+ − 133
//and bspill is size-preserving (up to a constant factor)
+ − 134
//so we bound |ders_simp(r,s)| by |PD(r)|
+ − 135
//where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper
0
+ − 136
2
+ − 137
//the function bspill converts a brexp into a list of rexps
+ − 138
//the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn}
+ − 139
//but this is not always allowed
+ − 140
//sometimes, we just leave the expression as it is:
+ − 141
//eg1: (a+b)*c -> {(a+b)*c}
+ − 142
//eg2: r1+r2 -> {r1+r2} instead of {r1, r2}
+ − 143
//why?
+ − 144
//if we always return {a, b} when we encounter a+b, the property
+ − 145
//bspill(ders_simp(r,s)) ⊂ PD(r)
+ − 146
//does not always hold
+ − 147
//for instance
+ − 148
//if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli"
+ − 149
//then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a}
+ − 150
//However the letter a does not belong to PD( (c+cb)+c(a+b) )
+ − 151
//then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have
+ − 152
//In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set
+ − 153
//Why a does not belong to PD((c+cb)+c(a+b))?
+ − 154
//PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b))
+ − 155
//So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b)
+ − 156
//we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r)
+ − 157
//so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b)
+ − 158
//RHS = {1} union pder(c, c(a+b))
+ − 159
//Observe that pder(c, c(a+b)) == {a+b}
+ − 160
//a and b are together, from the original regular expression (c+cb)+c(a+b).
+ − 161
//but by our simplification, we first flattened this a+b into the same level with 1+b, then
+ − 162
//removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b
+ − 163
//But in PD(r) we only have a+b, no a
+ − 164
//one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂ PD(r) union {all letters}
+ − 165
//But this does not hold either according to experiment.
+ − 166
//So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression
0
+ − 167
2
+ − 168
+ − 169
1
+ − 170
def bspill(r: BRexp): List[Rexp] = {
+ − 171
r match {
2
+ − 172
case BALTS(b, rs) => {
+ − 173
if(b == S)
+ − 174
rs.flatMap(bspill)
+ − 175
else
+ − 176
List(ALTS(rs.map(berase)))
0
+ − 177
}
+ − 178
case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) )
+ − 179
case BZERO => List()
+ − 180
case r => List(berase(r))
+ − 181
}
+ − 182
+ − 183
}
+ − 184
+ − 185
}