lexing: changeset 440:0856fbf8bda7

--- a/ChengsongPhdThesis/ChengsongPhDThesis.tex	Wed Mar 02 23:23:19 2022 +0000
+++ b/ChengsongPhdThesis/ChengsongPhDThesis.tex	Wed Mar 02 23:53:11 2022 +0000
@@ -95,9 +95,39 @@
 \end{abstract}
 
 \section{Introduction}
-\subsection{Practical Example of Regex}
+\subsection{Basic Regex Introduction}
+Suppose (basic) regular expressions are given by the following grammar:
+\[			r ::=   \ZERO \mid  \ONE
+			 \mid  c  
+			 \mid  r_1 \cdot r_2
+			 \mid  r_1 + r_2   
+			 \mid r^*         
+\]
+
+Problem of matching:
 
-%TODO: read rules libraries and the explanation for some of the rules
+\begin{center}
+\begin{tabular}{lcr}
+$\textit{Match}(r, s)$ & $ =  $ & $\textit{if}\; s \in L(r)\; \textit{output} \; \textit{YES}$\\
+&				& $\textit{else} \; \textit{output} \; \textit{NO}$
+\end{tabular}
+\end{center}
+
+
+\subsection{The practical problem}
+Introduce the problem of matching a pattern with a string.
+Why important?
+Need to be fast, real-time, on large inputs.
+Examples: Snort, Bro, etc?
+\subsubsection{The rules for network intrusion analysis tools }
+TODO: read rules libraries such as Snort and the explanation for some of the rules
+TODO: pcre/pcre2?
+TODO: any other libraries?
+
+
+\subsection{Regexes that brought down CloudFlare}
+
+
 matching some string $s$ with a regex
 
 \begin{verbatim}
@@ -111,37 +141,75 @@
 %Checking whether there is some malicious code 
 %in the network data blocks being routed.
 %If so, discard the data and identify the sender for future alert.
+\section{Existing approaches}
+\subsection{Shortcomings of different methods}
 
-\subsection{The problem: Efficient Matching and Lexing}
-A programmer writes patterns to process texts,
-where a regex is a structured symbolic pattern
-specifying what the string should be like.
+
+\subsubsection{ NFA's}
+Can be slow especially when many states are active.
 
-The above regex looks complicated, but can be 
-described by some basic constructs:
+\subsubsection{DFAs}
+The tool JFLEX uses it.
+Advantages: super fast on most regexes \\
+TODO: show it being fast on a lot of inputs\\
+Disavantages:
+state explosion for bounded repetitions due to 
+theoretic bottleneck of having to remember exactly what the
+suffix up to length $n$ of input string is.
+"Countdown States activation problem":
+$.*a.{100}$ requires $2^100$ + DFA states.
+\subsubsection{variant of DFA's}
+counting set automata 
+\\
+TODO: microsoft 2020 oopsla CsA work, need to add bibli entry, and read, understand key novelty, learn to benchmark like it
+\\
+TODO: find weakness of such counting set automata?
+\\
+Other variants?
+
+\subsubsection{NFA and Regex: isomorphic structure}
+TODO: define mathematically an isomorphism?\\
+
 
 
-Suppose (basic) regular expressions are given by the following grammar:
-\[			r ::=   \ZERO \mid  \ONE
-			 \mid  c  
-			 \mid  r_1 \cdot r_2
-			 \mid  r_1 + r_2   
-			 \mid r^*         
-\]
+\subsubsection{variants of NFA's}
+How about acting on regular expressions themselves? Certain copies represent verbose info--that they will always match the same string--prune away!
+
+\subsection{Brzozowski's derivatives}
+
+\subsection{Sulzmann and Lu's algorithm}
+
+\subsection{Bit-coded algorithm}
++bitcodes!
+Built on top of derivatives, but with auxiliary bits
+\subsection{Correctness Proof}
+unproven by Sulzmann  and Lu
+by Ausaf and Urban
+
+\section{My Work}
+
+\subsection{an improved version of bit-coded algorithm: with simp!}
 
-\noindent
-The intended meaning of the constructors is as follows: $\ZERO$
-cannot match any string, $\ONE$ can match the empty string, the
-character regular expression $c$ can match the character $c$, and so
-on.
+\subsection{a correctness proof for bitcoded}
+
+\subsection{finiteness proof }
+
+\subsection{stronger simplification}
+
+
+\subsection{cubic bound}
+
 
-and the underlying algorithmic problem is:
-\begin{center}
-\begin{tabular}{lcr}
-$\textit{Match}(r, s)$ & $ =  $ & $\textit{if}\; s \in L(r)\; \textit{output} \; \textit{YES}$\\
-&				& $\textit{else} \; \textit{output} \; \textit{NO}$
-\end{tabular}
-\end{center}
+
+
+
+
+
+
+
+
+
+\subsection{Too Detailed too basic? regex to NFA translation}
 
 Deciding whether a string is in the language of the regex
 can be intuitively done by constructing an NFA\cite{Thompson_1968}:
author	Chengsong
	Wed, 02 Mar 2022 23:53:11 +0000
changeset 440	0856fbf8bda7
parent 439	a5376206fd52
child 441	426a93160f4a