s?
authorChengsong
Wed, 03 Jul 2019 22:39:47 +0100
changeset 43 52a3cec0a5c7
parent 42 2a469388c989
child 44 4d674a971852
s?
ninems/ninems.tex
ninems/root.bib
--- a/ninems/ninems.tex	Wed Jul 03 22:06:47 2019 +0100
+++ b/ninems/ninems.tex	Wed Jul 03 22:39:47 2019 +0100
@@ -477,7 +477,11 @@
 
 \section{Simplification of Regular Expressions}
 Using bit-codes to guide  parsing is not a new idea.
+It was applied to context free grammars and then adapted by Henglein and Nielson for efficient regular expression parsing \cite{nielson11bcre}. Sulzmann and Lu took a step further by intergrating bitcodes into derivatives.
 
+The argument for complicating the data structures from basic regular expressions to those with bitcodes
+is that we can introduce simplification without making the algorithm crash or impossible to reason about.
+The reason why we need simplification is due to the shortcoming of a naive algorithm using Brzozowski's definition only. 
 The main drawback of building successive derivatives according to
 Brzozowski's definition is that they can grow very quickly in size.
 This is mainly due to the fact that the derivative operation generates
@@ -514,26 +518,13 @@
 %data, that a similar bound can be obtained for the derivatives in
 %Sulzmann and Lu's algorithm. Let us give some details about this next.
 
-We first followed Sulzmann and Lu's idea of introducing
-\emph{annotated regular expressions}~\cite{Sulzmann2014}. They are
-defined by the following grammar:
-
-\begin{center}
-\begin{tabular}{lcl}
-  $\textit{a}$ & $::=$  & $\textit{ZERO}$\\
-                  & $\mid$ & $\textit{ONE}\;\;bs$\\
-                  & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
-                  & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
-                  & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
-                  & $\mid$ & $\textit{STAR}\;\;bs\,a$
-\end{tabular}    
-\end{center}  
-
-\noindent
-where $bs$ stands for bitsequences, and $as$ (in \textit{ALTS}) for a
-list of annotated regular expressions. These bitsequences encode
-information about the (POSIX) value that should be generated by the
-Sulzmann and Lu algorithm. Bitcodes are essentially incomplete values.
+Bit-codes look like this:
+\[			b ::=   S \mid  Z \; \;\;
+bs ::= [] \mid b:bs    
+\]
+They are just a string of bits, the names "S" and "Z"  here are kind of arbitrary, we can use 0 and 1 or binary symbol to substitute them. They are a compact form of parse trees.
+Here is how values and bit-codes are related:
+Bitcodes are essentially incomplete values.
 This can be straightforwardly seen in the following transformation: 
 \begin{center}
 \begin{tabular}{lcl}
@@ -579,6 +570,27 @@
 \end{center}    
 %\end{definition}
 
+
+Sulzmann and Lu's integrated the bitcodes into annotated regular expressions by attaching them to the head of every substructure of a regularexpression\emph{annotated regular expressions}~\cite{Sulzmann2014}. They are
+defined by the following grammar:
+
+\begin{center}
+\begin{tabular}{lcl}
+  $\textit{a}$ & $::=$  & $\textit{ZERO}$\\
+                  & $\mid$ & $\textit{ONE}\;\;bs$\\
+                  & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
+                  & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
+                  & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
+                  & $\mid$ & $\textit{STAR}\;\;bs\,a$
+\end{tabular}    
+\end{center}  
+
+\noindent
+where $bs$ stands for bitsequences, and $as$ (in \textit{ALTS}) for a
+list of annotated regular expressions. These bitsequences encode
+information about the (POSIX) value that should be generated by the
+Sulzmann and Lu algorithm. 
+
 To do lexing using annotated regular expressions, we shall first transform the
 usual (un-annotated) regular expressions into annotated regular
 expressions:\\
@@ -598,7 +610,9 @@
 \end{tabular}    
 \end{center}    
 %\end{definition}
-Then we do successive derivative operations on the annotated regular expression. This derivative operation is the same as what we previously have for the simple regular expressions, except that we take special care of the bits to store the parse tree information:\\
+After internalise we do successive derivative operations on the annotated regular expression.
+Here $fuse$ is an auxiliary  function that helps to attach bits to the front of an annotated regular expression.
+ This derivative operation is the same as what we previously have for the simple regular expressions, except that we take special care of the bits to store the parse tree information:\\
 %\begin{definition}{bder}
 \begin{center}
   \begin{tabular}{@{}lcl@{}}
--- a/ninems/root.bib	Wed Jul 03 22:06:47 2019 +0100
+++ b/ninems/root.bib	Wed Jul 03 22:39:47 2019 +0100
@@ -1,13 +1,22 @@
 %% This BibTeX bibliography file was created using BibDesk.
 %% https://bibdesk.sourceforge.io/
 
-%% Created for CS TAN at 2019-06-26 17:07:31 +0100 
+%% Created for CS TAN at 2019-07-03 22:17:58 +0100 
 
 
 %% Saved with string encoding Unicode (UTF-8) 
 
 
 
+@article{nielson11bcre,
+	Author = { Lasse Nielsen, Fritz Henglein},
+	Date-Added = {2019-07-03 21:09:39 +0000},
+	Date-Modified = {2019-07-03 21:17:33 +0000},
+	Journal = {LATA},
+	Title = {Bit-coded Regular Expression Parsing},
+	Year = {2011},
+	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGJCVYJHZlcnNpb25YJG9iamVjdHNZJGFyY2hpdmVyVCR0b3ASAAGGoKgHCBMUFRYaIVUkbnVsbNMJCgsMDxJXTlMua2V5c1pOUy5vYmplY3RzViRjbGFzc6INDoACgAOiEBGABIAFgAdccmVsYXRpdmVQYXRoWWFsaWFzRGF0YV8QGC4uLy4uLy4uL2ZyaXR6LXBhcGVyLnBkZtIXCxgZV05TLmRhdGFPEQFGAAAAAAFGAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8PZnJpdHotcGFwZXIucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAADAAEAAAogY3UAAAAAAAAAAAAAAAAAB0Rlc2t0b3AAAAIAJS86VXNlcnM6Y3N0YW46RGVza3RvcDpmcml0ei1wYXBlci5wZGYAAA4AIAAPAGYAcgBpAHQAegAtAHAAYQBwAGUAcgAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIAI1VzZXJzL2NzdGFuL0Rlc2t0b3AvZnJpdHotcGFwZXIucGRmAAATAAEvAAAVAAIADP//AACABtIbHB0eWiRjbGFzc25hbWVYJGNsYXNzZXNdTlNNdXRhYmxlRGF0YaMdHyBWTlNEYXRhWE5TT2JqZWN00hscIiNcTlNEaWN0aW9uYXJ5oiIgXxAPTlNLZXllZEFyY2hpdmVy0SYnVHJvb3SAAQAIABEAGgAjAC0AMgA3AEAARgBNAFUAYABnAGoAbABuAHEAcwB1AHcAhACOAKkArgC2AgACAgIHAhICGwIpAi0CNAI9AkICTwJSAmQCZwJsAAAAAAAAAgEAAAAAAAAAKAAAAAAAAAAAAAAAAAAAAm4=}}
+
 @misc{SE16,
 	Author = {StackStatus},
 	Date-Added = {2019-06-26 11:28:41 +0000},