cst_tests: Brexp.scala@cf169411b771 (annotated)

0 902326e1615a run Chengsong parents: diff changeset	1	import RexpRelated._
902326e1615a run Chengsong parents: diff changeset	2	import RexpRelated.Rexp._
902326e1615a run Chengsong parents: diff changeset	3	import Spiral._
902326e1615a run Chengsong parents: diff changeset	4	import scala.collection.mutable.ArrayBuffer
902326e1615a run Chengsong parents: diff changeset	5	abstract class BRexp
902326e1615a run Chengsong parents: diff changeset	6	case object BZERO extends BRexp
902326e1615a run Chengsong parents: diff changeset	7	case object BONE extends BRexp
902326e1615a run Chengsong parents: diff changeset	8	case class BCHAR(c: Char) extends BRexp
902326e1615a run Chengsong parents: diff changeset	9	case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp
902326e1615a run Chengsong parents: diff changeset	10	case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp
902326e1615a run Chengsong parents: diff changeset	11	case class BSTAR(r: BRexp) extends BRexp
902326e1615a run Chengsong parents: diff changeset	12
902326e1615a run Chengsong parents: diff changeset	13	object BRexp{
902326e1615a run Chengsong parents: diff changeset	14	def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there
902326e1615a run Chengsong parents: diff changeset	15	//are no enclosing STAR or SEQ
902326e1615a run Chengsong parents: diff changeset	16	case ZERO => BZERO
902326e1615a run Chengsong parents: diff changeset	17	case ONE => BONE
902326e1615a run Chengsong parents: diff changeset	18	case CHAR(c) => BCHAR(c)
902326e1615a run Chengsong parents: diff changeset	19	case ALTS(rs) => BALTS(Z, rs.map(brternalise))
902326e1615a run Chengsong parents: diff changeset	20	case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) )
902326e1615a run Chengsong parents: diff changeset	21	case STAR(r) => BSTAR(brternalise(r))
902326e1615a run Chengsong parents: diff changeset	22	case RECD(x, r) => brternalise(r)
902326e1615a run Chengsong parents: diff changeset	23	}
902326e1615a run Chengsong parents: diff changeset	24	def brnullable (r: BRexp) : Boolean = r match {
902326e1615a run Chengsong parents: diff changeset	25	case BZERO => false
902326e1615a run Chengsong parents: diff changeset	26	case BONE => true
902326e1615a run Chengsong parents: diff changeset	27	case BCHAR(_) => false
902326e1615a run Chengsong parents: diff changeset	28	case BALTS(bs, rs) => rs.exists(brnullable)
902326e1615a run Chengsong parents: diff changeset	29	case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2)
902326e1615a run Chengsong parents: diff changeset	30	case BSTAR(_) => true
902326e1615a run Chengsong parents: diff changeset	31	}
1 99f4459d9bb6 i Chengsong parents: 0 diff changeset	32	//this function tells bspill when converting a brexp into a list of rexps
99f4459d9bb6 i Chengsong parents: 0 diff changeset	33	//the conversion : (a+b)c -> {ac, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq)
99f4459d9bb6 i Chengsong parents: 0 diff changeset	34	//or is from the original regex but have been "touched" (i.e. have been derived)
99f4459d9bb6 i Chengsong parents: 0 diff changeset	35	//Why make this distinction? Look at the following example:
99f4459d9bb6 i Chengsong parents: 0 diff changeset	36	//r = (c+cb)+c(a+b)
99f4459d9bb6 i Chengsong parents: 0 diff changeset	37	// r\c = (1+b)+(a+b)
99f4459d9bb6 i Chengsong parents: 0 diff changeset	38	//after simplification
99f4459d9bb6 i Chengsong parents: 0 diff changeset	39	// (r\c)simp= 1+b+a
99f4459d9bb6 i Chengsong parents: 0 diff changeset	40	//we lost the structure that tells us 1+b should be grouped together and a grouped as itself
99f4459d9bb6 i Chengsong parents: 0 diff changeset	41	//however in our brexp simplification,
99f4459d9bb6 i Chengsong parents: 0 diff changeset	42	//(r\c)br_simp = 1+b+(a+b)
99f4459d9bb6 i Chengsong parents: 0 diff changeset	43	//we do not allow the bracket to be opened as it is from the original expression and have not been touched
0 902326e1615a run Chengsong parents: diff changeset	44	def brder(c: Char, r: BRexp) : BRexp = r match {
902326e1615a run Chengsong parents: diff changeset	45	case BZERO => BZERO
902326e1615a run Chengsong parents: diff changeset	46	case BONE => BZERO
902326e1615a run Chengsong parents: diff changeset	47	case BCHAR(d) => if (c == d) BONE else BZERO
902326e1615a run Chengsong parents: diff changeset	48	case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD
902326e1615a run Chengsong parents: diff changeset	49	case BSEQ(r1, r2) =>
902326e1615a run Chengsong parents: diff changeset	50	if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit
902326e1615a run Chengsong parents: diff changeset	51	else BSEQ(brder(c, r1), r2)
902326e1615a run Chengsong parents: diff changeset	52	case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r))
902326e1615a run Chengsong parents: diff changeset	53	}
902326e1615a run Chengsong parents: diff changeset	54	def bflat(rs: List[BRexp]): List[BRexp] = {
902326e1615a run Chengsong parents: diff changeset	55	rs match {
902326e1615a run Chengsong parents: diff changeset	56	case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
902326e1615a run Chengsong parents: diff changeset	57	case BZERO :: rs1 => bflat(rs1)
902326e1615a run Chengsong parents: diff changeset	58	case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2)
902326e1615a run Chengsong parents: diff changeset	59	case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2)
902326e1615a run Chengsong parents: diff changeset	60	case r1 :: rs2 => r1 :: bflat(rs2)
902326e1615a run Chengsong parents: diff changeset	61	}
902326e1615a run Chengsong parents: diff changeset	62	}
902326e1615a run Chengsong parents: diff changeset	63	def stflat(rs: List[BRexp]): List[BRexp] = {
902326e1615a run Chengsong parents: diff changeset	64	//println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length)
902326e1615a run Chengsong parents: diff changeset	65	//assert(bs.length == rs.length - 1 \|\| bs.length == rs.length)//bs == Nil, rs == Nil May add early termination later.
902326e1615a run Chengsong parents: diff changeset	66	rs match {
902326e1615a run Chengsong parents: diff changeset	67	case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
902326e1615a run Chengsong parents: diff changeset	68	case BZERO :: rs1 => bflat(rs1)
902326e1615a run Chengsong parents: diff changeset	69	case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2)
902326e1615a run Chengsong parents: diff changeset	70	case r1 :: rs2 => r1 :: bflat(rs2)
902326e1615a run Chengsong parents: diff changeset	71	}
902326e1615a run Chengsong parents: diff changeset	72	}
902326e1615a run Chengsong parents: diff changeset	73	def berase(r:BRexp): Rexp = r match{
902326e1615a run Chengsong parents: diff changeset	74	case BZERO => ZERO
902326e1615a run Chengsong parents: diff changeset	75	case BONE => ONE
902326e1615a run Chengsong parents: diff changeset	76	case BCHAR(f) => CHAR(f)
902326e1615a run Chengsong parents: diff changeset	77	case BALTS(bs, rs) => ALTS(rs.map(berase(_)))
902326e1615a run Chengsong parents: diff changeset	78	case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2))
902326e1615a run Chengsong parents: diff changeset	79	case BSTAR(r)=> STAR(berase(r))
902326e1615a run Chengsong parents: diff changeset	80	}
902326e1615a run Chengsong parents: diff changeset	81	def br_simp(r: BRexp): BRexp = r match {
902326e1615a run Chengsong parents: diff changeset	82	case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match {
902326e1615a run Chengsong parents: diff changeset	83	case (BZERO, _) => BZERO
902326e1615a run Chengsong parents: diff changeset	84	case (_, BZERO) => BZERO
902326e1615a run Chengsong parents: diff changeset	85	case (BONE, r2s) => r2s
902326e1615a run Chengsong parents: diff changeset	86	case (r1s, r2s) => BSEQ(r1s, r2s)
902326e1615a run Chengsong parents: diff changeset	87	}
902326e1615a run Chengsong parents: diff changeset	88	case BALTS(bs, rs) => {
902326e1615a run Chengsong parents: diff changeset	89	//assert(bs.length == rs.length - 1)
902326e1615a run Chengsong parents: diff changeset	90	val dist_res = if(bs == S) {//all S
902326e1615a run Chengsong parents: diff changeset	91	val rs_simp = rs.map(br_simp)
902326e1615a run Chengsong parents: diff changeset	92	val flat_res = bflat(rs_simp)
902326e1615a run Chengsong parents: diff changeset	93	distinctBy(flat_res, berase)
902326e1615a run Chengsong parents: diff changeset	94	}
902326e1615a run Chengsong parents: diff changeset	95	else{//not allowed to simplify (all Z)
902326e1615a run Chengsong parents: diff changeset	96	rs
902326e1615a run Chengsong parents: diff changeset	97	}
902326e1615a run Chengsong parents: diff changeset	98	dist_res match {
902326e1615a run Chengsong parents: diff changeset	99	case Nil => {BZERO}
902326e1615a run Chengsong parents: diff changeset	100	case s :: Nil => { s}
902326e1615a run Chengsong parents: diff changeset	101	case rs => {BALTS(bs, rs)}
902326e1615a run Chengsong parents: diff changeset	102	}
902326e1615a run Chengsong parents: diff changeset	103	}
902326e1615a run Chengsong parents: diff changeset	104	//case BSTAR(r) => BSTAR(r)
902326e1615a run Chengsong parents: diff changeset	105	case r => r
902326e1615a run Chengsong parents: diff changeset	106	}
902326e1615a run Chengsong parents: diff changeset	107
902326e1615a run Chengsong parents: diff changeset	108	def strong_br_simp(r: BRexp): BRexp = r match {
902326e1615a run Chengsong parents: diff changeset	109	case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match {
902326e1615a run Chengsong parents: diff changeset	110	case (BZERO, _) => BZERO
902326e1615a run Chengsong parents: diff changeset	111	case (_, BZERO) => BZERO
902326e1615a run Chengsong parents: diff changeset	112	case (BONE, r2s) => r2s
902326e1615a run Chengsong parents: diff changeset	113	case (r1s, r2s) => BSEQ(r1s, r2s)
902326e1615a run Chengsong parents: diff changeset	114	}
902326e1615a run Chengsong parents: diff changeset	115	case BALTS(bs, rs) => {
902326e1615a run Chengsong parents: diff changeset	116	//assert(bs.length == rs.length - 1)
902326e1615a run Chengsong parents: diff changeset	117	val dist_res = {//all S
902326e1615a run Chengsong parents: diff changeset	118	val rs_simp = rs.map(strong_br_simp)
902326e1615a run Chengsong parents: diff changeset	119	val flat_res = stflat(rs_simp)
902326e1615a run Chengsong parents: diff changeset	120	distinctBy(flat_res, berase)
902326e1615a run Chengsong parents: diff changeset	121	}
902326e1615a run Chengsong parents: diff changeset	122	dist_res match {
902326e1615a run Chengsong parents: diff changeset	123	case Nil => {BZERO}
902326e1615a run Chengsong parents: diff changeset	124	case s :: Nil => { s}
902326e1615a run Chengsong parents: diff changeset	125	case rs => {BALTS(bs, rs)}
902326e1615a run Chengsong parents: diff changeset	126	}
902326e1615a run Chengsong parents: diff changeset	127	}
902326e1615a run Chengsong parents: diff changeset	128	//case BSTAR(r) => BSTAR(r)
902326e1615a run Chengsong parents: diff changeset	129	case r => r
902326e1615a run Chengsong parents: diff changeset	130	}
2 cf169411b771 more comments Chengsong parents: 1 diff changeset	131	//we want to bound the size by a function bspill s.t.
cf169411b771 more comments Chengsong parents: 1 diff changeset	132	//bspill(ders_simp(r,s)) ⊂ PD(r)
cf169411b771 more comments Chengsong parents: 1 diff changeset	133	//and bspill is size-preserving (up to a constant factor)
cf169411b771 more comments Chengsong parents: 1 diff changeset	134	//so we bound \|ders_simp(r,s)\| by \|PD(r)\|
cf169411b771 more comments Chengsong parents: 1 diff changeset	135	//where we already have a nice bound for \|PD(r)\|: t^2*n^2 in Antimirov's paper
0 902326e1615a run Chengsong parents: diff changeset	136
2 cf169411b771 more comments Chengsong parents: 1 diff changeset	137	//the function bspill converts a brexp into a list of rexps
cf169411b771 more comments Chengsong parents: 1 diff changeset	138	//the conversion mainly about this: (r1+r2)r3r4.....rn -> {r1r3r4....rn, r2r3r4*.....rn}
cf169411b771 more comments Chengsong parents: 1 diff changeset	139	//but this is not always allowed
cf169411b771 more comments Chengsong parents: 1 diff changeset	140	//sometimes, we just leave the expression as it is:
cf169411b771 more comments Chengsong parents: 1 diff changeset	141	//eg1: (a+b)c -> {(a+b)c}
cf169411b771 more comments Chengsong parents: 1 diff changeset	142	//eg2: r1+r2 -> {r1+r2} instead of {r1, r2}
cf169411b771 more comments Chengsong parents: 1 diff changeset	143	//why?
cf169411b771 more comments Chengsong parents: 1 diff changeset	144	//if we always return {a, b} when we encounter a+b, the property
cf169411b771 more comments Chengsong parents: 1 diff changeset	145	//bspill(ders_simp(r,s)) ⊂ PD(r)
cf169411b771 more comments Chengsong parents: 1 diff changeset	146	//does not always hold
cf169411b771 more comments Chengsong parents: 1 diff changeset	147	//for instance
cf169411b771 more comments Chengsong parents: 1 diff changeset	148	//if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli"
cf169411b771 more comments Chengsong parents: 1 diff changeset	149	//then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a}
cf169411b771 more comments Chengsong parents: 1 diff changeset	150	//However the letter a does not belong to PD( (c+cb)+c(a+b) )
cf169411b771 more comments Chengsong parents: 1 diff changeset	151	//then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have
cf169411b771 more comments Chengsong parents: 1 diff changeset	152	//In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set
cf169411b771 more comments Chengsong parents: 1 diff changeset	153	//Why a does not belong to PD((c+cb)+c(a+b))?
cf169411b771 more comments Chengsong parents: 1 diff changeset	154	//PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b))
cf169411b771 more comments Chengsong parents: 1 diff changeset	155	//So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b)
cf169411b771 more comments Chengsong parents: 1 diff changeset	156	//we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r)
cf169411b771 more comments Chengsong parents: 1 diff changeset	157	//so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b)
cf169411b771 more comments Chengsong parents: 1 diff changeset	158	//RHS = {1} union pder(c, c(a+b))
cf169411b771 more comments Chengsong parents: 1 diff changeset	159	//Observe that pder(c, c(a+b)) == {a+b}
cf169411b771 more comments Chengsong parents: 1 diff changeset	160	//a and b are together, from the original regular expression (c+cb)+c(a+b).
cf169411b771 more comments Chengsong parents: 1 diff changeset	161	//but by our simplification, we first flattened this a+b into the same level with 1+b, then
cf169411b771 more comments Chengsong parents: 1 diff changeset	162	//removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b
cf169411b771 more comments Chengsong parents: 1 diff changeset	163	//But in PD(r) we only have a+b, no a
cf169411b771 more comments Chengsong parents: 1 diff changeset	164	//one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂ PD(r) union {all letters}
cf169411b771 more comments Chengsong parents: 1 diff changeset	165	//But this does not hold either according to experiment.
cf169411b771 more comments Chengsong parents: 1 diff changeset	166	//So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression
0 902326e1615a run Chengsong parents: diff changeset	167
2 cf169411b771 more comments Chengsong parents: 1 diff changeset	168
cf169411b771 more comments Chengsong parents: 1 diff changeset	169
1 99f4459d9bb6 i Chengsong parents: 0 diff changeset	170	def bspill(r: BRexp): List[Rexp] = {
99f4459d9bb6 i Chengsong parents: 0 diff changeset	171	r match {
2 cf169411b771 more comments Chengsong parents: 1 diff changeset	172	case BALTS(b, rs) => {
cf169411b771 more comments Chengsong parents: 1 diff changeset	173	if(b == S)
cf169411b771 more comments Chengsong parents: 1 diff changeset	174	rs.flatMap(bspill)
cf169411b771 more comments Chengsong parents: 1 diff changeset	175	else
cf169411b771 more comments Chengsong parents: 1 diff changeset	176	List(ALTS(rs.map(berase)))
0 902326e1615a run Chengsong parents: diff changeset	177	}
902326e1615a run Chengsong parents: diff changeset	178	case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) )
902326e1615a run Chengsong parents: diff changeset	179	case BZERO => List()
902326e1615a run Chengsong parents: diff changeset	180	case r => List(berase(r))
902326e1615a run Chengsong parents: diff changeset	181	}
902326e1615a run Chengsong parents: diff changeset	182
902326e1615a run Chengsong parents: diff changeset	183	}
902326e1615a run Chengsong parents: diff changeset	184
902326e1615a run Chengsong parents: diff changeset	185	}

author	Chengsong
	Wed, 13 Mar 2019 15:27:09 +0000
changeset 2	cf169411b771
parent 1	99f4459d9bb6
child 5	622ddbb1223a
permissions	-rwxr-xr-x