# HG changeset patch # User Christian Urban # Date 1494122489 -3600 # Node ID e28d7a327870af644be9f8855f96c591504c0961 # Parent 598741d39d21dbfd21082673fc026c12a5ca3298 updated diff -r 598741d39d21 -r e28d7a327870 handouts/ho03.pdf Binary file handouts/ho03.pdf has changed diff -r 598741d39d21 -r e28d7a327870 handouts/ho03.tex --- a/handouts/ho03.tex Sun May 07 00:20:58 2017 +0100 +++ b/handouts/ho03.tex Sun May 07 03:01:29 2017 +0100 @@ -570,7 +570,7 @@ \end{figure} The case for the sequence regular expression $r_1 \cdot r_2$ is a bit more -complicated: We are given by recursion two NFAs representing the regular +complicated: Say, we are given by recursion two NFAs representing the regular expressions $r_1$ and $r_2$ respectively. \begin{center} @@ -605,10 +605,11 @@ \end{center} \noindent The first NFA has some accepting states and the second some -starting states. We obtain $\epsilon$NFA for $r_1\cdot r_2$ by -connecting these accepting states with $\epsilon$-transitions to the -starting states of the second automaton. By doing so we make them -non-accepting like so: +starting states. We obtain an $\epsilon$NFA for $r_1\cdot r_2$ by +connecting the accepting states of the first NFA with +$\epsilon$-transitions to the starting states of the second +automaton. By doing so we make the accepting states of the first NFA +to be non-accepting like so: \begin{center} \begin{tikzpicture}[node distance=3mm, @@ -645,20 +646,37 @@ \end{tikzpicture} \end{center} -\noindent The case for the choice regular expression $r_1 + -r_2$ is slightly different: We are given by recursion two -automata representing $r_1$ and $r_2$ respectively. +\noindent The idea behind this construction is that the start of any +string is first recognised by the first NFA, then we silently change +to the second NFA; the ending of the string is recognised by the +second NFA...just like matching of a string by the regular expression +$r_1\cdot r_2$. The Scala code for this constrction is given in +Figure~\ref{thompson2} in Lines 16--23. The starting states of the +$\epsilon$NFA are the starting states of the first NFA (corresponding +to $r_1$); the accepting function is the accepting function of the +second NFA (corresponding to $r_2$). The new transition function is +all the ``old'' transitions plus the $\epsilon$-transitions connecting +the accepting states of the first NFA to the starting states of the +first NFA (Lines 18 and 19). The $\epsilon$NFA is then immedately +translated in a NFA. + + +The case for the choice regular expression $r_1 + r_2$ is slightly +different: We are given by recursion two NFAs representing $r_1$ and +$r_2$ respectively. \begin{center} \begin{tikzpicture}[node distance=3mm, >=stealth',very thick, every state/.style={minimum size=3pt,draw=blue!50,very thick,fill=blue!20},] \node at (0,0) (1) {$\mbox{}$}; -\node[state, initial] (2) [above right=16mm of 1] {$\mbox{}$}; -\node[state, initial] (3) [below right=16mm of 1] {$\mbox{}$}; +\node (2) [above=10mm of 1] {}; +\node[state, initial] (4) [above=1mm of 2] {$\mbox{}$}; +\node[state, initial] (5) [below=1mm of 2] {$\mbox{}$}; +\node[state, initial] (3) [below=10mm of 1] {$\mbox{}$}; -\node (a) [right=of 2] {$\ldots$}; -\node[state, accepting] (a1) [right=of a] {$\mbox{}$}; +\node (a) [right=of 2] {$\ldots\,$}; +\node (a1) [right=of a] {$$}; \node[state, accepting] (a2) [above=of a1] {$\mbox{}$}; \node[state, accepting] (a3) [below=of a1] {$\mbox{}$}; @@ -675,21 +693,24 @@ \end{tikzpicture} \end{center} -\noindent Each automaton has a single start state and -potentially several accepting states. We obtain a NFA for the -regular expression $r_1 + r_2$ by introducing a new starting -state and connecting it with an $\epsilon$-transition to the -two starting states above, like so +\noindent Each NFA has some starting states and some accepting +states. We obtain a NFA for the regular expression $r_1 + r_2$ +by composing the transition functions (this crucially depends +on knowing that the states of each component NFA are distinct); +and also combine the starting states and accepting functions: \begin{center} -\hspace{2cm}\begin{tikzpicture}[node distance=3mm, - >=stealth',very thick, every state/.style={minimum size=3pt,draw=blue!50,very thick,fill=blue!20},] -\node at (0,0) [state, initial] (1) {$\mbox{}$}; -\node[state] (2) [above right=16mm of 1] {$\mbox{}$}; -\node[state] (3) [below right=16mm of 1] {$\mbox{}$}; +\begin{tikzpicture}[node distance=3mm, + >=stealth',very thick, + every state/.style={minimum size=3pt,draw=blue!50,very thick,fill=blue!20},] +\node at (0,0) (1) {$\mbox{}$}; +\node (2) [above=10mm of 1] {$$}; +\node[state, initial] (4) [above=1mm of 2] {$\mbox{}$}; +\node[state, initial] (5) [below=1mm of 2] {$\mbox{}$}; +\node[state, initial] (3) [below=10mm of 1] {$\mbox{}$}; -\node (a) [right=of 2] {$\ldots$}; -\node[state, accepting] (a1) [right=of a] {$\mbox{}$}; +\node (a) [right=of 2] {$\ldots\,$}; +\node (a1) [right=of a] {$$}; \node[state, accepting] (a2) [above=of a1] {$\mbox{}$}; \node[state, accepting] (a3) [below=of a1] {$\mbox{}$}; @@ -698,8 +719,8 @@ \node[state, accepting] (b2) [above=of b1] {$\mbox{}$}; \node[state, accepting] (b3) [below=of b1] {$\mbox{}$}; -\path[->] (1) edge node [above] {$\epsilon$} (2); -\path[->] (1) edge node [below] {$\epsilon$} (3); +%\path[->] (1) edge node [above] {$\epsilon$} (2); +%\path[->] (1) edge node [below] {$\epsilon$} (3); \begin{pgfonlayer}{background} \node (3) [rounded corners, inner sep=1mm, thick, draw=black!60, fill=black!20, fit= (1) (a2) (a3) (b2) (b3)] {}; @@ -708,8 +729,8 @@ \end{tikzpicture} \end{center} -\noindent -Finally for the $*$-case we have an automaton for $r$ +\noindent The code for this construction is in Figure~\ref{thompson2} +in Lines 25--33. Finally for the $*$-case we have a NFA for $r$ \begin{center} \begin{tikzpicture}[node distance=3mm, @@ -727,14 +748,16 @@ \end{tikzpicture} \end{center} -\noindent and connect its accepting states to a new starting -state via $\epsilon$-transitions. This new starting state is -also an accepting state, because $r^*$ can recognise the -empty string. This gives the following automaton for $r^*$: +\noindent and connect its accepting states to a new starting state via +$\epsilon$-transitions. This new starting state is also an accepting +state, because $r^*$ can recognise the empty string. This gives the +following $\epsilon$NFA for $r^*$ (the corresponding code is in +Figure~\ref{thompson2} in Lines 35--43: \begin{center} \begin{tikzpicture}[node distance=3mm, - >=stealth',very thick, every state/.style={minimum size=3pt,draw=blue!50,very thick,fill=blue!20},] + >=stealth',very thick, + every state/.style={minimum size=3pt,draw=blue!50,very thick,fill=blue!20},] \node at (0,0) [state, initial,accepting] (1) {$\mbox{}$}; \node[state] (2) [right=16mm of 1] {$\mbox{}$}; \node (a) [right=of 2] {$\ldots$}; @@ -752,6 +775,15 @@ \end{tikzpicture} \end{center} + +To sum ap, you can see in the sequence and star cases the need of +having silent $\epsilon$-transitions. Similarly the alternative case +shows the need of the NFA-nondeterminsim. It seems awkward to form the +`alternative' composition of two DFAs, because DFA do not allow +several starting and successor states. All these constructions can now +be put together in the following recursive function: + + {\small\begin{lstlisting}[language=Scala,linebackgroundcolor= {\ifodd\value{lstnumber}\color{capri!3}\fi}] def thompson (r: Rexp) : NFAt = r match { @@ -764,11 +796,55 @@ } \end{lstlisting}} +\noindent +It calculates a NFA from a regular expressions. At last we can run a +NFA for the our evil regular expression examples. + \begin{center} \begin{tabular}{@{\hspace{-1mm}}c@{\hspace{1mm}}c@{}} \begin{tikzpicture} \begin{axis}[ + title={Graph: $\texttt{a?\{n\}\,a{\{n\}}}$ and strings + $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$}, + xlabel={$n$}, + x label style={at={(1.05,0.0)}}, + ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=35, + ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=5.5cm, + height=4.5cm, + legend entries={Python,Ruby}, + legend pos=south east, + legend cell align=left] + \addplot[blue,mark=*, mark options={fill=white}] table {re-python.data}; + \addplot[brown,mark=triangle*, mark options={fill=white}] table {re-ruby.data}; + % breath-first search in NFAs + \addplot[red,mark=*, mark options={fill=white}] table { + 1 0.00586 + 2 0.01209 + 3 0.03076 + 4 0.08269 + 5 0.12881 + 6 0.25146 + 7 0.51377 + 8 0.89079 + 9 1.62802 + 10 3.05326 + 11 5.92437 + 12 11.67863 + 13 24.00568 + }; +\end{axis} +\end{tikzpicture} +& +\begin{tikzpicture} +\begin{axis}[ title={Graph: $\texttt{(a*)*\,b}$ and strings $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$}, xlabel={$n$}, @@ -800,50 +876,10 @@ }; \end{axis} \end{tikzpicture} -& -\begin{tikzpicture} -\begin{axis}[ - title={Graph: $\texttt{a?\{n\}\,a{\{n\}}}$ and strings - $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$}, - xlabel={$n$}, - x label style={at={(1.05,0.0)}}, - ylabel={time in secs}, - enlargelimits=false, - xtick={0,5,...,30}, - xmax=33, - ymax=35, - ytick={0,5,...,30}, - scaled ticks=false, - axis lines=left, - width=5.5cm, - height=4.5cm, - legend entries={Python,Ruby}, - legend pos=south east, - legend cell align=left] - \addplot[blue,mark=*, mark options={fill=white}] table {re-python.data}; - \addplot[brown,mark=triangle*, mark options={fill=white}] table {re-ruby.data}; - % breath-first search in NFAs - \addplot[red,mark=*, mark options={fill=white}] table { - 1 0.00741 - 2 0.02657 - 3 0.08317 - 4 0.22133 - 5 0.54463 - 6 1.42062 - 7 4.04926 - 8 12.70395 - 9 39.21555 - 10 112.05466 - }; -\end{axis} -\end{tikzpicture} \end{tabular} \end{center} -\noindent This construction of a NFA from a regular expression -was invented by Ken Thompson in 1968. - \subsubsection*{Subset Construction} diff -r 598741d39d21 -r e28d7a327870 progs/display/thompson2.scala --- a/progs/display/thompson2.scala Sun May 07 00:20:58 2017 +0100 +++ b/progs/display/thompson2.scala Sun May 07 03:01:29 2017 +0100 @@ -24,13 +24,12 @@ // alternative of two NFAs def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = { - val Q = TState() - val new_delta : eNFAtrans = - { case (Q, None) => enfa1.starts | enfa2.starts } + val new_delta : NFAtrans = { + case (q, c) => applyOrElse(enfa1.delta, (q, c)) | + applyOrElse(enfa2.delta, (q, c)) } val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q) - eNFA(Set(Q), new_delta +++ enfa1.delta +++ enfa2.delta, - new_fins) + NFA(enfa1.starts | enfa2.starts, new_delta, new_fins) } // star of a NFA diff -r 598741d39d21 -r e28d7a327870 progs/nfa.scala --- a/progs/nfa.scala Sun May 07 00:20:58 2017 +0100 +++ b/progs/nfa.scala Sun May 07 03:01:29 2017 +0100 @@ -85,4 +85,4 @@ // subset constructions -def subset(nfa: NFA[A, C]) : DFA[Set[A], C] = +//def subset(nfa: NFA[A, C]) : DFA[Set[A], C] = diff -r 598741d39d21 -r e28d7a327870 progs/thompson.scala --- a/progs/thompson.scala Sun May 07 00:20:58 2017 +0100 +++ b/progs/thompson.scala Sun May 07 03:01:29 2017 +0100 @@ -60,12 +60,12 @@ // alternative of two NFAs def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = { - val Q = TState() - val new_delta : eNFAtrans = - { case (Q, None) => enfa1.starts | enfa2.starts } + val new_delta : NFAtrans = { + case (q, c) => applyOrElse(enfa1.delta, (q, c)) | + applyOrElse(enfa2.delta, (q, c)) } val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q) - eNFA(Set(Q), new_delta +++ enfa1.delta +++ enfa2.delta, new_fins) + NFA(enfa1.starts | enfa2.starts, new_delta, new_fins) } // star of a NFA @@ -124,10 +124,10 @@ // the evil regular expression a?{n} a{n} -def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) +def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) // the evil regular expression (a*)*b -val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) +val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) //for measuring time def time_needed[T](i: Int, code: => T) = { @@ -140,11 +140,11 @@ // the size of the NFA can be large, // thus slowing down the breadth-first search -for (i <- 1 to 10) { +for (i <- 1 to 13) { println(i + ": " + "%.5f".format(time_needed(2, tmatches(EVIL1(i), "a" * i)))) } -for (i <- 1 to 10) { +for (i <- 1 to 100 by 5) { println(i + " " + "%.5f".format(time_needed(2, tmatches(EVIL2, "a" * i)))) }