16 \newcommand{\lemmaautorefname}{Lemma} |
16 \newcommand{\lemmaautorefname}{Lemma} |
17 |
17 |
18 \theoremstyle{definition} |
18 \theoremstyle{definition} |
19 \newtheorem{definition}{Definition} |
19 \newtheorem{definition}{Definition} |
20 \begin{document} |
20 \begin{document} |
21 This is a sketch proof for the correctness of the algorithm ders\_simp. |
21 This is an outline of the structure of the paper with a little bit of flesh in it. |
22 \section{Function Definitions} |
22 \section{The Flow of thought process} |
23 |
23 |
24 \begin{definition}{Bits} |
24 |
25 \begin{verbatim} |
25 \begin{definition}{Regular Expressions and values} |
26 abstract class Bit |
26 \begin{verbatim} |
27 case object Z extends Bit |
27 TODO |
28 case object S extends Bit |
28 \end{verbatim} |
29 case class C(c: Char) extends Bit |
29 \end{definition} |
30 |
30 |
31 type Bits = List[Bit] |
31 Value is.a parse tree for the regular expression matching the string. |
32 \end{verbatim} |
32 |
33 \end{definition} |
33 |
34 |
34 |
35 \begin{definition}{Annotated Regular Expressions} |
35 \begin{definition}{nullable} |
36 \begin{verbatim} |
36 \begin{verbatim} |
37 abstract class ARexp |
37 TODO |
38 case object AZERO extends ARexp |
38 \end{verbatim} |
39 case class AONE(bs: Bits) extends ARexp |
39 \end{definition} |
40 case class ACHAR(bs: Bits, f: Char) extends ARexp |
40 The idea behind nullable: whether it contains the empty string. |
41 case class AALTS(bs: Bits, rs: List[ARexp]) extends ARexp |
41 |
42 case class ASEQ(bs: Bits, r1: ARexp, r2: ARexp) extends ARexp |
42 \begin{definition}{derivatives} |
43 case class ASTAR(bs: Bits, r: ARexp) extends ARexp |
43 TODO |
44 \end{verbatim} |
44 |
45 \end{definition} |
45 \end{definition} |
46 |
46 |
47 \begin{definition}{bnullable} |
47 This definition can be used for matching algorithm. |
48 \begin{verbatim} |
48 |
49 def bnullable (r: ARexp) : Boolean = r match { |
49 \begin{definition}{matcher} |
50 case AZERO => false |
50 TODO |
51 case AONE(_) => true |
51 \end{definition} |
52 case ACHAR(_,_) => false |
52 |
53 case AALTS(_, rs) => rs.exists(bnullable) |
53 \begin{definition}{POSIX values} |
54 case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2) |
54 TODO |
55 case ASTAR(_, _) => true |
55 \end{definition} |
56 } |
56 |
57 \end{verbatim} |
57 \begin{definition}{POSIX lexer algorithm} |
58 \end{definition} |
58 TODO |
59 |
59 \end{definition} |
60 \begin{definition}{ders\_simp} |
60 |
61 \begin{verbatim} |
61 \begin{definition}{POSIX lexer algorithm with simplification} |
62 def ders_simp(r: ARexp, s: List[Char]): ARexp = { |
62 TODO |
63 s match { |
63 \end{definition} |
64 case Nil => r |
64 This simplification algorithm is rather complicated as it entangles derivative, simplification and value reconstruction. We need to split the regular expression structure of the information for lexing so that simplifcation only changes the regex but does not destroy the information for reconstructing the resulting value.\\ |
65 case c::cs => ders_simp(bsimp(bder(c, r)), cs) |
65 |
66 } |
66 Introduce regex with auxiliary information: |
67 }\end{verbatim} |
67 |
68 \end{definition} |
68 \begin{definition}{annotated regular expression} |
|
69 TODO |
|
70 \end{definition} |
|
71 |
|
72 \begin{definition}{encoding} |
|
73 TODO |
|
74 \end{definition} |
|
75 Encoding translates values into bit codes with some information loss. |
|
76 |
|
77 \begin{definition}{decoding} |
|
78 TODO |
|
79 \end{definition} |
|
80 Decoding translates bitcodes back into values with the help of regex to recover the structure.\\ |
|
81 |
|
82 During different phases of lexing, we sometimes already know what the value would look like if we match the branch of regex with the string(e.g. a STAR with 1 more iteration, a left/right value), so we can partially encode the value at diffferent phases of the algorithm for later decoding. |
|
83 \\ |
|
84 Examples of such partial encoding: |
|
85 |
|
86 \begin{definition}{internalise} |
|
87 TODO |
|
88 \end{definition} |
|
89 When doing internalise on ALT:\\ |
|
90 Whichever branch is chosen, we know exactly the shape of the value, and therefore can get the bit code for such a value: Left corresponds to Z and Right to S |
|
91 |
|
92 \begin{definition}{bmkeps} |
|
93 TODO |
|
94 \end{definition} |
|
95 bmkeps on the STAR case:\\ |
|
96 We know there could be no more iteration for a star if we want to get a POSIX value for an empty string, so the value must be Stars [], corresponding to an S in the bit code. |
69 |
97 |
70 \begin{definition}{bder} |
98 \begin{definition}{bder} |
71 \begin{verbatim} |
99 TODO |
72 def bder(c: Char, r: ARexp) : ARexp = r match { |
100 \end{definition} |
73 case AZERO => AZERO |
101 SEQ case with the first regex nullable:\\ |
74 case AONE(_) => AZERO |
102 bmkeps will extract the value for how a1 mathces the empty string and encode it into a bit sequence. |
75 case ACHAR(bs, f) => if (c == f) AONE(bs:::List(C(c))) else AZERO |
103 |
76 case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _))) |
104 \begin{definition}{blexer} |
77 case ASEQ(bs, r1, r2) => { |
105 \begin{verbatim} |
78 if (bnullable(r1)) AALT(bs, ASEQ(Nil, bder(c, r1), r2), fuse(mkepsBC(r1), bder(c, r2))) |
106 TODO |
79 else ASEQ(bs, bder(c, r1), r2) |
107 \end{verbatim} |
80 } |
108 \end{definition} |
81 case ASTAR(bs, r) => ASEQ(bs, fuse(List(S), bder(c, r)), ASTAR(Nil, r)) |
109 |
82 } |
110 adding simplifcation.\\ |
83 \end{verbatim} |
111 |
84 \end{definition} |
112 size of simplified regex: smaller than Antimirov's pder.\\ |
85 |
113 |
|
114 |
|
115 |
|
116 The rest of the document is the residual from a previous doc and may be deleted.\\ |
86 \begin{definition}{bsimp} |
117 \begin{definition}{bsimp} |
87 \begin{verbatim} |
118 \begin{verbatim} |
88 def bsimp(r: ARexp): ARexp = r match { |
119 def bsimp(r: ARexp): ARexp = r match { |
89 case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match { |
120 case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match { |
90 case (AZERO, _) => AZERO |
121 case (AZERO, _) => AZERO |