160 \\We omit the subscript when it is clear from context what that character is and write $d(r)$ instead of $d_{c}(r)$. |
166 \\We omit the subscript when it is clear from context what that character is and write $d(r)$ instead of $d_{c}(r)$. |
161 \\And we omit the parentheses when no confusion can be caused. For example ders\_simp(c, r) can be written as $s(d_{c}(r))$ or even $s d r$ as we know the derivative operation is w.r.t the character c. Here the s and d are more like operators that take an annotated regular expression as an input and return an annotated regular expression as an output |
167 \\And we omit the parentheses when no confusion can be caused. For example ders\_simp(c, r) can be written as $s(d_{c}(r))$ or even $s d r$ as we know the derivative operation is w.r.t the character c. Here the s and d are more like operators that take an annotated regular expression as an input and return an annotated regular expression as an output |
162 |
168 |
163 \end{definition} |
169 \end{definition} |
164 |
170 |
165 \begin{definition}{mkepsBC invariant manipulation of bits and notation}\\ |
171 |
166 ALTS(bs, ALTS(bs1, rs1), ALTS(bs2, rs2)) $\sim_{m\epsilon}$ ALTS(bs, rs1.map(fuse(bs1, \_)) ++ rs2.map(fuse(bs2, \_)) ). \\ |
|
167 We also use $bs2>>rs2 $ as a shorthand notation for rs2.map(fuse(bs2,\_)). |
|
168 |
|
169 \end{definition} |
|
170 |
172 |
171 \begin{definition}{distinctBy operation expressed in a different way--how it transforms the list}\\ |
173 \begin{definition}{distinctBy operation expressed in a different way--how it transforms the list}\\ |
172 Given two lists rs1 and rs2, we define the operation $--$:\\ |
174 Given two lists rs1 and rs2, we define the operation $--$:\\ |
173 $rs1 -- rs2 := [r \in rs1 | r \notin rs2]$ |
175 $rs1 -- rs2 := [r \in rs1 | r \notin rs2]$ |
174 Note that the order is preserved as in the original list. |
176 Note that the order each term appears in $rs_1 -- rs_2$ is preserved as in the original list. |
175 \end{definition} |
177 \end{definition} |
176 |
178 |
177 |
179 |
178 \section{Main Result} |
180 \section{Main Result} |
179 \begin{lemma}{simplification function does not simplify an already simplified regex}\\ |
181 |
|
182 \begin{lemma}{simplification function does not simplify an already simplified regex}\label{lma1}\\ |
180 bsimp(r) == bsimp(bsimp(r)) holds for any annotated regular expression r. |
183 bsimp(r) == bsimp(bsimp(r)) holds for any annotated regular expression r. |
181 \end{lemma} |
184 \end{lemma} |
182 |
185 |
183 \begin{lemma}{simp and mkeps}\\ |
186 \begin{lemma}{simp and mkeps}\label{lma2}\\ |
184 When r is nullable, we have that |
187 When r is nullable, we have that |
185 mkeps(bsimp(r)) == mkeps(r) |
188 mkeps(bsimp(r)) == mkeps(r) |
186 \end{lemma} |
189 \end{lemma} |
187 |
190 |
188 \begin{lemma}{mkeps equivalence w.r.t some syntactically different regular expressions(1 ALTS)}\\ |
191 |
189 When one of the 2 regular expressions $s(r_1)$ and $s(r_2)$ is ALTS(bs1, rs1), we have that $ds(ALTS(bs, r1, r2)) \sim_{m\epsilon} d(ALTS(bs, sr_1, sr_2))$ |
192 %\begin{theorem}See~\cref{lma1}.\end{theorem} |
190 \end{lemma} |
193 %\begin{lemma}\label{lma1}\lipsum[2]\end{lemma} |
191 \begin{proof} |
194 |
192 By opening up one of the alts and show no additional changes are made. |
195 \begin{lemma}{mkeps equivalence w.r.t some syntactically different regular expressions(1 ALTS)}\label{lma3}\\ |
193 \end{proof} |
196 When one of the 2 regular expressions $s(r_1)$ and $s(r_2)$ is of the form ALTS(bs1, rs1), we have that $ds(ALTS(bs, r1, r2)) \sim_{m\epsilon} d(ALTS(bs, sr_1, sr_2))$ |
194 |
197 \end{lemma} |
195 \begin{lemma}{mkepsBC equivalence w.r.t syntactically different regular expressions(2 ALTS)}\\ |
198 \begin{proof} |
|
199 By opening up one of the alts and show no additional changes are made.\\ |
|
200 Details: $ds(ALTS(bs, r1, r2)) = d Co( bs, dB(flats(sr1, sr2)) )$ |
|
201 \end{proof} |
|
202 |
|
203 |
|
204 \begin{lemma}{mkepsBC invariant manipulation of bits and notation}\label{lma7}\\ |
|
205 ALTS(bs, ALTS(bs1, rs1), ALTS(bs2, rs2)) $\sim_{m\epsilon}$ ALTS(bs, rs1.map(fuse(bs1, \_)) ++ rs2.map(fuse(bs2, \_)) ). \\ |
|
206 We also use $bs2>>rs2 $ as a shorthand notation for rs2.map(fuse(bs2,\_)). |
|
207 \end{lemma} |
|
208 |
|
209 \begin{lemma}{mkepsBC equivalence w.r.t syntactically different regular expressions(2 ALTS)}\label{lma4}\\ |
196 $sr_1 = ALTS(bs1, rs1)$ and $sr_2 = ALTS(bs2, rs2)$ we have $ d(sr_1 +sr_2 ) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ bs2>>rs2))$ |
210 $sr_1 = ALTS(bs1, rs1)$ and $sr_2 = ALTS(bs2, rs2)$ we have $ d(sr_1 +sr_2 ) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ bs2>>rs2))$ |
197 \end{lemma} |
211 \end{lemma} |
198 \begin{proof} |
212 \begin{proof} |
199 We are just fusing bits inside here, there is no other structural change. |
213 We are just fusing bits inside here, there is no other structural change. |
200 \end{proof} |
214 \end{proof} |
201 |
215 |
202 \begin{lemma}{mkepsBC equivalence w.r.t syntactically different regular expressions(2 ALTS+ some deletion)}\\ |
216 \begin{lemma}{What does dB do to two already simplified ALTS}\label{lma5}\\ |
203 $d Co(ALTS(bs, dB(bs1>>rs1 ++ bs2>>rs2))) \sim_{m\epsilon} d Co(ALTS(bs, dB(bs1>>rs1 ++ ((bs2>>rs2)--rs1) ))) $ |
217 $d Co(ALTS(bs, dB(bs1>>rs1 ++ bs2>>rs2))) = d Co(ALTS(bs, bs1>>rs1 ++ ((bs2>>rs2)--rs1) )) $ |
204 \end{lemma} |
218 \end{lemma} |
205 \begin{proof} |
219 \begin{proof} |
206 The removed parts have already appeared before in $rs_1$, so if any of them is truly nullable and is chosen as the mkeps path, it will have been traversed through in its previous counterpart.\\ |
220 We prove that $ dB(bs1>>rs1 ++ bs2>>rs2) = bs1>>rs1 ++ ((bs2>>rs2)--rs1)$. |
207 (We probably need to switch the position of lemma5 and lemma6) |
221 |
208 \end{proof} |
222 |
209 |
223 \end{proof} |
210 \begin{lemma}{after opening two previously simplified alts up into terms, length must exceed 2}\\ |
224 |
211 $d Co(ALTS(bs, rs )) \sim_{m\epsilon} d(ALTS(bs, rs))$ if $rs$ is a list of length greater than or equal to 2. |
225 \begin{lemma}{after opening two previously simplified alts up into terms, length must exceed 2}\label{lma6}\\ |
212 \end{lemma} |
226 If sr1, sr2 are of the form ALTS(bs1, rs1), ALTS(bs2, rs2) respectively, then we have that |
213 \begin{proof} |
227 $Co(bs, (bs1>>rs1) ++ (bs2>>rs2)--rs1) = ALTS(bs, bs1>>rs1 ++ (bs2>>rs2)--rs1)$ |
214 As suggested by the title of this lemma |
228 \end{lemma} |
215 \end{proof} |
229 \begin{proof} |
216 |
230 $Co(bs, rs) \sim_{m\epsilon} ALTS(bs, rs)$ if $rs$ is a list of length greater than or equal to 2. |
217 |
231 As suggested by the title of this lemma, ALTS(bs1, rs1) is a result of simplification, which means that rs1 must be composed of at least 2 distinct regular terms. This alone says that $bs1>>rs1 ++ (bs2>>rs2)--rs1$ is a list of length greater than or equal to 2, as the second operand of the concatenation operator $(bs2>>rs2) -- rs1$ can only contribute a non-negative value to the overall length of the list |
218 |
232 $bs1>>rs1 ++ (bs2>>rs2)--rs1$. |
|
233 \end{proof} |
|
234 |
|
235 |
|
236 \begin{lemma}{mkepsBC equivalence w.r.t syntactically different regular expressions(2 ALTS+ some deletion after derivatives)}\label{lma8}\\ |
|
237 $d ALTS(bs, bs1>>rs1 ++ bs2>>rs2) \sim_{m\epsilon} d ALTS(bs, bs1>>rs1 ++ ((bs2>>rs2)--rs1) ) $ |
|
238 \end{lemma} |
|
239 \begin{proof} |
|
240 Let's call $bs1>>rs1$ $rs1'$ and $bs2>>rs2$ $rs2'$. Then we need to prove $d ALTS(bs, rs1' ++rs2') \sim_{m\epsilon} d ALTS(bs, rs1'++(rs2' -- rs1') ) $.\\ |
|
241 We might as well omit the prime in each rs for simplicty of notation and prove $d ALTS(bs, rs1++rs2) \sim_{m\epsilon} d ALTS(bs, rs1++(rs2 -- rs1) ) $.\\ |
|
242 We know that the result of derivative is nullable, so there must exist an r in rs1++rs2 s.t. r is nullable.\\ |
|
243 If $r \in rs1$, then equivalence holds. If $r \in rs2 \wedge r \notin rs1$, equivalence holds as well. This completes the proof. |
|
244 \end{proof} |
219 |
245 |
220 |
246 |
221 |
247 |
222 \begin{theorem}{Correctness Result} |
248 \begin{theorem}{Correctness Result} |
223 |
249 |
234 |
260 |
235 \begin{proof}{Split into 2 parts.} |
261 \begin{proof}{Split into 2 parts.} |
236 \begin{itemize} |
262 \begin{itemize} |
237 \item |
263 \item |
238 |
264 |
|
265 |
239 When we have an annotated regular expression ar and a string s that matches ar, by the correctness of the algorithm ders, we have that ders(ar, s) is nullable, and that mkepsBC will extract the desired bits for decoding the correct value v for the matching, and v is a POSIX value. Now we prove that mkepsBC(ders\_simp(ar, s)) yields the same bitsequence. |
266 When we have an annotated regular expression ar and a string s that matches ar, by the correctness of the algorithm ders, we have that ders(ar, s) is nullable, and that mkepsBC will extract the desired bits for decoding the correct value v for the matching, and v is a POSIX value. Now we prove that mkepsBC(ders\_simp(ar, s)) yields the same bitsequence. |
240 \\ |
267 \\ |
241 We first open up the ders\_simp function into nested alternating sequences of ders and simp. |
268 We first open up the ders\_simp function into nested alternating sequences of ders and simp. |
242 Assume that s = $c_1...c_n$($n \geq 1$ ) where each of the $c_i$ are characters. |
269 Assume that s = $c_1...c_n$($n \geq 1$ ) where each of the $c_i$ are characters. |
243 Then $ders\_simp(ar, s)$ = $s(d_{c_n}(...s(d_{c_1}(r))...))$ = $sdsd......sdr$. If we can prove that |
270 Then $ders\_simp(ar, s)$ = $s(d_{c_n}(...s(d_{c_1}(r))...))$ = $sdsd......sdr$. If we can prove that |
244 $sdr \sim_{m\epsilon} dsr$ holds for any regular expression and any character, then we are done. This is because then we can push ders operation inside and move simp operation outside and have that $sdsd...sdr \sim_{m\epsilon} ssddsdsd...sdr \sim_{m\epsilon} ... \sim_{m\epsilon} s....sd....dr$ and using lemma1 we have that $s...sd....dr = sd...dr$. By lemma2, we have $RHS \sim_{m\epsilon} d...dr$.\\ |
271 $sdr \sim_{m\epsilon} dsr$ holds for any regular expression and any character, then we are done. This is because then we can push ders operation inside and move simp operation outside and have that $sdsd...sdr \sim_{m\epsilon} ssddsdsd...sdr \sim_{m\epsilon} ... \sim_{m\epsilon} s....sd....dr$. |
245 Now we proceed to prove that $sdr \sim_{m\epsilon} dsr$. This can be reduced to proving $dr \sim_{m\epsilon} dsr$ as we know that $dr \sim_{m\epsilon} sdr$ by lemma2.\\ |
272 Using \autoref{lma1} we have that $s...sd....dr = sd...dr$. |
|
273 By \autoref{lma2}, we have $RHS \sim_{m\epsilon} d...dr$.\\ |
|
274 Notice that we don't actually need \autoref{lma1} here. That is because by \autoref{lma2}, we can have that $s...sd....dr \sim_{m\epsilon} sd...dr$. The equality above can be replaced by mkepsBC |
|
275 equivalence without affecting the validity of the whole proof since all we want is mkepsBC equivalence, not equality. |
|
276 |
|
277 Now we proceed to prove that $sdr \sim_{m\epsilon} dsr$. This can be reduced to proving $dr \sim_{m\epsilon} dsr$ as we know that $dr \sim_{m\epsilon} sdr$ by \autoref{lma2}.\\ |
246 |
278 |
247 we use an induction proof. Base cases are omitted. Here are the 3 inductive cases. |
279 we use an induction proof. Base cases are omitted. Here are the 3 inductive cases. |
248 \begin{itemize} |
280 \begin{itemize} |
249 |
281 |
250 \item{$r_1+r_2$} |
282 \item{$r_1+r_2$} |
251 $r_1+r_2$\\ |
283 $r_1+r_2$\\ |
252 The most difficult case is when $sr1$ and $sr2$ are both ALTS, so that they will be opened up in the flats function and some terms in sr2 might be deleted. Or otherwise we can use the argument that $d(r_1+r_2) = dr_1 + dr_2 \sim_{m\epsilon} dsr_1 + dsr_2 \sim_{m\epsilon} ds(r_1+r_2)$, |
284 The most difficult case is when $sr1$ and $sr2$ are both ALTS, so that they will be opened up in the flats function and some terms in sr2 might be deleted. Or otherwise we can use the argument that $d(r_1+r_2) = dr_1 + dr_2 \sim_{m\epsilon} dsr_1 + dsr_2 \sim_{m\epsilon} ds(r_1+r_2)$, |
253 the last equivalence being established by lemma3. |
285 the last equivalence being established by \autoref{lma3}. |
254 When $s(r_1), s(r_2)$ are both ALTS, we have to be more careful for the last equivalence step, namelly, $dsr_1 + dsr_2 \sim_{m\epsilon} ds(r_1+r_2)$. \\ |
286 When $s(r_1), s(r_2)$ are both ALTS, we have to be more careful for the last equivalence step, namelly, $dsr_1 + dsr_2 \sim_{m\epsilon} ds(r_1+r_2)$. \\ |
255 We have that $LHS = dsr_1 + dsr_2 = d(sr_1 +sr_2)$. Since $sr_1 = ALTS(bs1, rs1)$ and $sr_2 = ALTS(bs2, rs2)$ we have $ d(sr_1 +sr_2 ) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ bs2>>rs2))$ by lemma4. |
287 We have that $LHS = dsr_1 + dsr_2 = d(sr_1 +sr_2)$. Since $sr_1 = ALTS(bs1, rs1)$ and $sr_2 = ALTS(bs2, rs2)$ we have $ d(sr_1 +sr_2 ) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ bs2>>rs2))$ by \autoref{lma4}. |
256 On the other hand, $RHS = ds(ALTS(bs, r1, r2)) \sim_{m\epsilon}d Co(ALTS(bs, dB(flats(s(r1), s(r2))))) == d Co(ALTS(bs, dB(bs1>>rs1 ++ bs2>>rs2)))$ by definition of bsimp and flats.\\ $d Co(ALTS(bs, dB(bs1>>rs1 ++ bs2>>rs2))) \sim_{m\epsilon} d Co(ALTS(bs, (bs1>>rs1 ++ ((bs2>>rs2)--rs1) ))) $ by lemma5.\\ $d Co(ALTS(bs, (bs1>>rs1 ++ ((bs2>>rs2)--rs1) ))) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ (bs2>>rs2)--rs1))$ by lemma6. \\ |
288 On the other hand, $RHS = ds(ALTS(bs, r1, r2)) = d Co(bs, dB(flats(s(r1), s(r2)) ) ) = d Co(bs, dB(bs1>>rs1 ++ bs2>>rs2))$ by definition of bsimp and flats.\\ |
257 Using lemma5 again, we have $d(ALTS(bs, bs1>>rs1 ++ (bs2>>rs2)--rs1)) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ bs2>>rs2))$.\\ |
289 $d Co(bs, dB(bs1>>rs1 ++ bs2>>rs2)) = d Co(bs, (bs1>>rs1 ++ ((bs2>>rs2)--rs1))) $ by \autoref{lma5}.\\ $d Co(bs, (bs1>>rs1 ++ ((bs2>>rs2)--rs1) )) = d(ALTS(bs, bs1>>rs1 ++ (bs2>>rs2)--rs1))$ by \autoref{lma6}. \\ |
|
290 Using \autoref{lma8}, we have $d(ALTS(bs, bs1>>rs1 ++ (bs2>>rs2)--rs1)) \sim_{m\epsilon} d(ALTS(bs, bs1>>rs1 ++ bs2>>rs2)) \sim_{m\epsilon} RHS$.\\ |
|
291 %only the last part we don't have an equality here. We use mkeps equivalence instead because these two regexes are indeed different syntactically. However, they have even more in common than just mkeps equal. We might later augment this proof so that this equivalence relation is changed to something like regular expression equivalence. |
258 This completes the proof. |
292 This completes the proof. |
259 \item{$r*$}\\ |
293 \item{$r*$}\\ |
260 s(r*) = s(r). |
294 s(r*) = s(r). |
261 \item{$r1.r2$}\\ |
295 \item{$r1.r2$}\\ |
262 using previous. |
296 using previous. |