Brexp.scala
author Chengsong
Thu, 09 Jan 2020 22:20:09 +0000
changeset 93 d486c12deeab
parent 5 622ddbb1223a
permissions -rwxr-xr-x
h
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
Chengsong
parents:
diff changeset
     1
import RexpRelated._
Chengsong
parents:
diff changeset
     2
import RexpRelated.Rexp._
Chengsong
parents:
diff changeset
     3
import Spiral._
Chengsong
parents:
diff changeset
     4
import scala.collection.mutable.ArrayBuffer
Chengsong
parents:
diff changeset
     5
abstract class BRexp
Chengsong
parents:
diff changeset
     6
case object BZERO extends BRexp
Chengsong
parents:
diff changeset
     7
case object BONE extends BRexp
Chengsong
parents:
diff changeset
     8
case class BCHAR(c: Char) extends BRexp
Chengsong
parents:
diff changeset
     9
case class BALTS(b: Bit, rs: List[BRexp]) extends BRexp 
Chengsong
parents:
diff changeset
    10
case class BSEQ(r1: BRexp, r2: BRexp) extends BRexp 
Chengsong
parents:
diff changeset
    11
case class BSTAR(r: BRexp) extends BRexp 
Chengsong
parents:
diff changeset
    12
Chengsong
parents:
diff changeset
    13
object BRexp{
Chengsong
parents:
diff changeset
    14
  def brternalise(r: Rexp) : BRexp = r match {//remember the initial shadowed bit should be set to Z as there 
Chengsong
parents:
diff changeset
    15
  //are no enclosing STAR or SEQ
Chengsong
parents:
diff changeset
    16
    case ZERO => BZERO
Chengsong
parents:
diff changeset
    17
    case ONE => BONE
Chengsong
parents:
diff changeset
    18
    case CHAR(c) => BCHAR(c)
Chengsong
parents:
diff changeset
    19
    case ALTS(rs) => BALTS(Z,  rs.map(brternalise))
Chengsong
parents:
diff changeset
    20
    case SEQ(r1, r2) => BSEQ( brternalise(r1), brternalise(r2) )
Chengsong
parents:
diff changeset
    21
    case STAR(r) => BSTAR(brternalise(r))
Chengsong
parents:
diff changeset
    22
    case RECD(x, r) => brternalise(r)
Chengsong
parents:
diff changeset
    23
  }
Chengsong
parents:
diff changeset
    24
  def brnullable (r: BRexp) : Boolean = r match {
Chengsong
parents:
diff changeset
    25
    case BZERO => false
Chengsong
parents:
diff changeset
    26
    case BONE => true
Chengsong
parents:
diff changeset
    27
    case BCHAR(_) => false
Chengsong
parents:
diff changeset
    28
    case BALTS(bs, rs) => rs.exists(brnullable)
Chengsong
parents:
diff changeset
    29
    case BSEQ(r1, r2) => brnullable(r1) && brnullable(r2)
Chengsong
parents:
diff changeset
    30
    case BSTAR(_) => true
Chengsong
parents:
diff changeset
    31
  }
1
Chengsong
parents: 0
diff changeset
    32
  //this function tells bspill when converting a brexp into a list of rexps
Chengsong
parents: 0
diff changeset
    33
  //the conversion : (a+b)*c -> {a*c, b*c} can only happen when a+b is generated from scratch (e.g. when deriving a seq)
Chengsong
parents: 0
diff changeset
    34
  //or is from the original regex but have been "touched" (i.e. have been derived)
Chengsong
parents: 0
diff changeset
    35
  //Why make this distinction? Look at the following example:
Chengsong
parents: 0
diff changeset
    36
  //r = (c+cb)+c(a+b)
Chengsong
parents: 0
diff changeset
    37
  // r\c = (1+b)+(a+b)
Chengsong
parents: 0
diff changeset
    38
  //after simplification
Chengsong
parents: 0
diff changeset
    39
  // (r\c)simp= 1+b+a
Chengsong
parents: 0
diff changeset
    40
  //we lost the structure that tells us 1+b should be grouped together and a grouped as itself
Chengsong
parents: 0
diff changeset
    41
  //however in our brexp simplification, 
Chengsong
parents: 0
diff changeset
    42
  //(r\c)br_simp = 1+b+(a+b)
Chengsong
parents: 0
diff changeset
    43
  //we do not allow the bracket to be opened as it is from the original expression and have not been touched
0
Chengsong
parents:
diff changeset
    44
  def brder(c: Char, r: BRexp) : BRexp = r match {
Chengsong
parents:
diff changeset
    45
    case BZERO => BZERO
Chengsong
parents:
diff changeset
    46
    case BONE => BZERO
Chengsong
parents:
diff changeset
    47
    case BCHAR(d) => if (c == d) BONE else BZERO
Chengsong
parents:
diff changeset
    48
    case BALTS(bs, rs) => BALTS(S, rs.map(brder(c, _)))//After derivative: Splittable in the sense of PD
Chengsong
parents:
diff changeset
    49
    case BSEQ(r1, r2) => 
Chengsong
parents:
diff changeset
    50
      if (brnullable(r1)) BALTS(S, List(BSEQ(brder(c, r1), r2), brder(c, r2) ) )//in r1\c~r2 r2's structure is maintained together with its splittablility bit
Chengsong
parents:
diff changeset
    51
      else BSEQ(brder(c, r1), r2)
Chengsong
parents:
diff changeset
    52
    case BSTAR(r) => BSEQ(brder(c, r), BSTAR(r))
Chengsong
parents:
diff changeset
    53
  }
Chengsong
parents:
diff changeset
    54
  def bflat(rs: List[BRexp]): List[BRexp] = {
Chengsong
parents:
diff changeset
    55
    rs match {
Chengsong
parents:
diff changeset
    56
      case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
Chengsong
parents:
diff changeset
    57
      case BZERO :: rs1 => bflat(rs1)
Chengsong
parents:
diff changeset
    58
      case BALTS(S, rs1) :: rs2 => rs1 ::: bflat(rs2)
Chengsong
parents:
diff changeset
    59
      case BALTS(Z, rs1) :: rs2 => BALTS(Z, rs1) :: bflat(rs2)
Chengsong
parents:
diff changeset
    60
      case r1 :: rs2 => r1 :: bflat(rs2)
Chengsong
parents:
diff changeset
    61
    }
Chengsong
parents:
diff changeset
    62
  }
Chengsong
parents:
diff changeset
    63
  def stflat(rs: List[BRexp]): List[BRexp] = {
Chengsong
parents:
diff changeset
    64
    //println("bs: " + bs + "rs: "+ rs + "length of bs, rs" + bs.length + ", " + rs.length)
Chengsong
parents:
diff changeset
    65
    //assert(bs.length == rs.length - 1 || bs.length == rs.length)//bs == Nil, rs == Nil  May add early termination later.
Chengsong
parents:
diff changeset
    66
    rs match {
Chengsong
parents:
diff changeset
    67
      case Nil => Nil//TODO: Look at this! bs should have been exhausted one bit earlier.
Chengsong
parents:
diff changeset
    68
      case BZERO :: rs1 => bflat(rs1)
Chengsong
parents:
diff changeset
    69
      case BALTS(_, rs1) :: rs2 => rs1 ::: bflat(rs2)
Chengsong
parents:
diff changeset
    70
      case r1 :: rs2 => r1 :: bflat(rs2)
Chengsong
parents:
diff changeset
    71
    }
Chengsong
parents:
diff changeset
    72
  }
Chengsong
parents:
diff changeset
    73
  def berase(r:BRexp): Rexp = r match{
Chengsong
parents:
diff changeset
    74
    case BZERO => ZERO
Chengsong
parents:
diff changeset
    75
    case BONE => ONE
Chengsong
parents:
diff changeset
    76
    case BCHAR(f) => CHAR(f)
Chengsong
parents:
diff changeset
    77
    case BALTS(bs, rs) => ALTS(rs.map(berase(_)))
Chengsong
parents:
diff changeset
    78
    case BSEQ(r1, r2) => SEQ (berase(r1), berase(r2))
Chengsong
parents:
diff changeset
    79
    case BSTAR(r)=> STAR(berase(r))
Chengsong
parents:
diff changeset
    80
  }
Chengsong
parents:
diff changeset
    81
  def br_simp(r: BRexp): BRexp = r match {
Chengsong
parents:
diff changeset
    82
    case BSEQ(r1, r2) => (br_simp(r1), br_simp(r2)) match {
Chengsong
parents:
diff changeset
    83
        case (BZERO, _) => BZERO
Chengsong
parents:
diff changeset
    84
        case (_, BZERO) => BZERO
Chengsong
parents:
diff changeset
    85
        case (BONE, r2s) => r2s 
Chengsong
parents:
diff changeset
    86
        case (r1s, r2s) => BSEQ(r1s, r2s)
Chengsong
parents:
diff changeset
    87
    }
Chengsong
parents:
diff changeset
    88
    case BALTS(bs, rs) => {
Chengsong
parents:
diff changeset
    89
      //assert(bs.length == rs.length - 1)
Chengsong
parents:
diff changeset
    90
      val dist_res = if(bs == S) {//all S
Chengsong
parents:
diff changeset
    91
        val rs_simp = rs.map(br_simp)
Chengsong
parents:
diff changeset
    92
        val flat_res = bflat(rs_simp)
Chengsong
parents:
diff changeset
    93
        distinctBy(flat_res, berase)
Chengsong
parents:
diff changeset
    94
      }
Chengsong
parents:
diff changeset
    95
      else{//not allowed to simplify (all Z)
Chengsong
parents:
diff changeset
    96
        rs
Chengsong
parents:
diff changeset
    97
      }
Chengsong
parents:
diff changeset
    98
      dist_res match {
Chengsong
parents:
diff changeset
    99
        case Nil => {BZERO}
Chengsong
parents:
diff changeset
   100
        case s :: Nil => { s}
Chengsong
parents:
diff changeset
   101
        case rs => {BALTS(bs, rs)}
Chengsong
parents:
diff changeset
   102
      }
Chengsong
parents:
diff changeset
   103
    }
Chengsong
parents:
diff changeset
   104
    //case BSTAR(r) => BSTAR(r)
Chengsong
parents:
diff changeset
   105
    case r => r
Chengsong
parents:
diff changeset
   106
  }
Chengsong
parents:
diff changeset
   107
Chengsong
parents:
diff changeset
   108
  def strong_br_simp(r: BRexp): BRexp = r match {
Chengsong
parents:
diff changeset
   109
    case BSEQ(r1, r2) => (strong_br_simp(r1), strong_br_simp(r2)) match {
Chengsong
parents:
diff changeset
   110
        case (BZERO, _) => BZERO
Chengsong
parents:
diff changeset
   111
        case (_, BZERO) => BZERO
Chengsong
parents:
diff changeset
   112
        case (BONE, r2s) => r2s 
Chengsong
parents:
diff changeset
   113
        case (r1s, r2s) => BSEQ(r1s, r2s)
Chengsong
parents:
diff changeset
   114
    }
Chengsong
parents:
diff changeset
   115
    case BALTS(bs, rs) => {
Chengsong
parents:
diff changeset
   116
      //assert(bs.length == rs.length - 1)
Chengsong
parents:
diff changeset
   117
      val dist_res = {//all S
Chengsong
parents:
diff changeset
   118
        val rs_simp = rs.map(strong_br_simp)
Chengsong
parents:
diff changeset
   119
        val flat_res = stflat(rs_simp)
Chengsong
parents:
diff changeset
   120
        distinctBy(flat_res, berase)
Chengsong
parents:
diff changeset
   121
      }
Chengsong
parents:
diff changeset
   122
      dist_res match {
Chengsong
parents:
diff changeset
   123
        case Nil => {BZERO}
Chengsong
parents:
diff changeset
   124
        case s :: Nil => { s}
Chengsong
parents:
diff changeset
   125
        case rs => {BALTS(bs, rs)}
Chengsong
parents:
diff changeset
   126
      }
Chengsong
parents:
diff changeset
   127
    }
5
622ddbb1223a correctness test with enumeration
Chengsong
parents: 2
diff changeset
   128
    case BSTAR(r) => BSTAR(strong_br_simp(r))
0
Chengsong
parents:
diff changeset
   129
    case r => r
Chengsong
parents:
diff changeset
   130
  }
2
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   131
  //we want to bound the size by a function bspill s.t.
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   132
  //bspill(ders_simp(r,s)) ⊂  PD(r) 
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   133
  //and bspill is size-preserving (up to a constant factor)
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   134
  //so we bound |ders_simp(r,s)| by |PD(r)|
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   135
  //where we already have a nice bound for |PD(r)|: t^2*n^2 in Antimirov's paper
0
Chengsong
parents:
diff changeset
   136
2
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   137
  //the function bspill converts a brexp into a list of rexps
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   138
  //the conversion mainly about this: (r1+r2)*r3*r4*.....rn -> {r1*r3*r4*....rn, r2*r3*r4*.....rn} 
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   139
  //but this is not always allowed
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   140
  //sometimes, we just leave the expression as it is: 
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   141
  //eg1: (a+b)*c -> {(a+b)*c}
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   142
  //eg2: r1+r2 -> {r1+r2} instead of {r1, r2}
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   143
  //why?
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   144
  //if we always return {a, b} when we encounter a+b, the property
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   145
  //bspill(ders_simp(r,s)) ⊂  PD(r) 
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   146
  //does not always hold
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   147
  //for instance 
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   148
  //if we call the bspill that always returns {r1, r2} when encountering r1+r2 "bspilli"
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   149
  //then bspilli( ders_simp( (c+cb)+c(a+b), c ) ) == bspilli(1+b+a) = {1, b, a}
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   150
  //However the letter a does not belong to PD( (c+cb)+c(a+b) )
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   151
  //then we can no longer say ders_simp(r,s)'s size is bounded by PD(r) because the former contains something the latter does not have
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   152
  //In order to make sure the inclusion holds, we have to find out why new terms appeared in the bspilli set that don't exist in the PD(r) set
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   153
  //Why a does not belong to PD((c+cb)+c(a+b))?
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   154
  //PD(r1+r2) = PD(r1) union PD(r2) => PD((c+cb)+c(a+b)) = PD(c+cb) union PD(c(a+b))
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   155
  //So the only possibility for PD to include a must be in the latter part of the regex, namely, c(a+b)
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   156
  //we have lemma that PD(r) = union of pders(s, r) where s is taken over all strings whose length does not exceed depth(r)
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   157
  //so PD(r) ⊂ pder(empty_string, r) union pder(c, r) union pder(ca, r) union pder(cb, r) where r = c(a+b)
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   158
  //RHS = {1} union pder(c, c(a+b))
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   159
  //Observe that pder(c, c(a+b)) == {a+b}
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   160
  //a and b are together, from the original regular expression (c+cb)+c(a+b).
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   161
  //but by our simplification, we first flattened this a+b into the same level with 1+b, then
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   162
  //removed duplicates of b, thereby destroying the structure in a+b and making this term a, instead of a+b
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   163
  //But in PD(r) we only have a+b, no a
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   164
  //one ad hoc solution might be to try this bspill(ders_simp(r,s)) ⊂  PD(r) union {all letters}
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   165
  //But this does not hold either according to experiment.
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   166
  //So we need to make sure the structure r1+r2 is not simplified away if it is from the original expression
0
Chengsong
parents:
diff changeset
   167
  
2
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   168
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   169
  
1
Chengsong
parents: 0
diff changeset
   170
  def bspill(r: BRexp): List[Rexp] = {
Chengsong
parents: 0
diff changeset
   171
      r match {
2
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   172
          case BALTS(b, rs) => {
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   173
            if(b == S)
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   174
              rs.flatMap(bspill)
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   175
            else
cf169411b771 more comments
Chengsong
parents: 1
diff changeset
   176
              List(ALTS(rs.map(berase)))
0
Chengsong
parents:
diff changeset
   177
          }
Chengsong
parents:
diff changeset
   178
          case BSEQ(r2, r3) => bspill(r2).map( a => if(a == ONE) berase(r3) else SEQ(a, berase(r3)) )
Chengsong
parents:
diff changeset
   179
          case BZERO => List()
Chengsong
parents:
diff changeset
   180
          case r => List(berase(r))
Chengsong
parents:
diff changeset
   181
      }
Chengsong
parents:
diff changeset
   182
    
Chengsong
parents:
diff changeset
   183
  }
Chengsong
parents:
diff changeset
   184
Chengsong
parents:
diff changeset
   185
}