afl-material: comparison handouts/ho05.tex

equal deleted inserted replaced

-:ddcb616e036a
+:aaf0bd0a211d
 % !TEX program = xelatex
 \documentclass{article}
 \usepackage{../style}
 \usepackage{../langs}
 \usepackage{../grammar}
+\usepackage{../graphics}
 % epsilon and left-recursion elimination
 % http://www.mollypages.org/page/grammar/index.mp
 %% parsing scala files
 : \meta{B} ::= 0 \cdot \meta{B'} | 1 \cdot \meta{B'}\\
 \end{plstx}
 \noindent To follow the general scheme of the transfromation,
 the $\alpha$ is either is either $0$ or $1$, and the $\beta$ happens
-to be empty. SO we need to generate new rules for the form
+to be empty. So we need to generate new rules for the form
 \mbox{$\meta{A} := \alpha\cdot\beta$}, which in our particular
 example means we obtain
 \begin{plstx}[margin=1cm]
 : \meta{B} ::= 0 \cdot \meta{B'} | 1 \cdot \meta{B'} | 0 | 1\\
 \noindent
 is recognised by the grammar. The CYK algorithm starts with the
 following triangular data structure.
-\begin{figure}[t]
+%%\begin{figure}[t]
 \begin{center}
-\begin{tikzpicture}[scale=0.8,line width=0.8mm]
+\begin{tikzpicture}[scale=0.7,line width=0.8mm]
 \draw (-2,0) -- (4,0);
 \draw (-2,1) -- (4,1);
 \draw (-2,2) -- (3,2);
 \draw (-2,3) -- (2,3);
 \draw (-2,4) -- (1,4);
 \draw ( 0.5,0.5) node {$N,V$};
 \draw ( 1.5,0.5) node {$A$};
 \draw ( 2.5,0.5) node {$N$};
 \draw ( 3.5,0.5) node {$N,V$};
+%  \draw (-1.5,1.5) node {\small{}a};
+%  \draw (-0.5,1.5) node {\small{}b};
+%  \draw ( 0.5,1.5) node {\small{}c};
+%  \draw ( 1.5,1.5) node {\small{}d};
+%  \draw ( 2.5,1.5) node {\small{}e};
 \draw (-2.4, 5.5) node {$1$};
 \draw (-2.4, 4.5) node {$2$};
 \draw (-2.4, 3.5) node {$3$};
 \draw (-2.4, 2.5) node {$4$};
 \draw (-2.4, 1.5) node {$5$};
 \draw (-2.4, 0.5) node {$6$};
 \end{tikzpicture}
 \end{center}
-\end{figure}
+%%\end{figure}
+\noindent
+The last row contains the information about all words and their
+corresponding non-terminals. For example the field for \texttt{trains}
+contains the information $\meta{N}$ and $\meta{V}$ because it can be a
+``verb'' and a ``noun'' according to the grammar.  The row above,
+let's call the corresponding fields 5a to 5e, contains information
+about 2-word parts of the sentence, namely
+\begin{center}
+\begin{tabular}{llll}
+5a) & $\underbrace{\texttt{The}}_{A}$ $\mid$ $\underbrace{\texttt{trainer}}_{N}$   \\
+5b) & $\underbrace{\texttt{trainer}}_{N}$ $\mid$ $\underbrace{\texttt{trains}}_{N,V}$\\
+5c) & \texttt{trains} $\mid$ \texttt{the}    \\
+5d) & \texttt{the} $\mid$ \texttt{student}   \\
+5e) & \texttt{student} $\mid$ \texttt{team}  \\
+\end{tabular}
+\end{center}
+\noindent
+For each of them, we look up in row 6 which non-terminals it belongs to
+(indicated above for 5a and 5b). For 5a, with the non-terminals
+\meta{A} and \meta{N}, we find the grammar rule
+\begin{plstx}[margin=1cm]
+: \meta{N} ::= \meta{A}\cdot \meta{N}\\
+\end{plstx}
+\noindent
+which means into field 5a we put the left-hand side of this rule,
+which in this case is the non-terminal \meta{N}. For 5b we have to check
+for both $\meta{N}\cdot\meta{N}$ and $\meta{N}\cdot\meta{V}$ whether there
+is a right-hand side of this form in the grammar. But only the
+grammar rule
+\begin{plstx}[margin=1cm]
+: \meta{N} ::= \meta{N}\cdot \meta{N}\\
+\end{plstx}
+\noindent
+matches, which means 5b gets also an \meta{N}. Continuing for all
+fields in row 5 gives:
+\begin{center}
+\begin{tikzpicture}[scale=0.7,line width=0.8mm]
+\draw (-2,0) -- (4,0);
+\draw (-2,1) -- (4,1);
+\draw (-2,2) -- (3,2);
+\draw (-2,3) -- (2,3);
+\draw (-2,4) -- (1,4);
+\draw (-2,5) -- (0,5);
+\draw (-2,6) -- (-1,6);
+\draw (0,0) -- (0, 5);
+\draw (1,0) -- (1, 4);
+\draw (2,0) -- (2, 3);
+\draw (3,0) -- (3, 2);
+\draw (4,0) -- (4, 1);
+\draw (-1,0) -- (-1, 6);
+\draw (-2,0) -- (-2, 6);
+\draw (-1.5,-0.5) node {\footnotesize{}\texttt{The}};
+\draw (-0.5,-1.0) node {\footnotesize{}\texttt{trainer}};
+\draw ( 0.5,-0.5) node {\footnotesize{}\texttt{trains}};
+\draw ( 1.5,-1.0) node {\footnotesize{}\texttt{the}};
+\draw ( 2.5,-0.5) node {\footnotesize{}\texttt{student}};
+\draw ( 3.5,-1.0) node {\footnotesize{}\texttt{team}};
+\draw (-1.5,0.5) node {$A$};
+\draw (-0.5,0.5) node {$N$};
+\draw ( 0.5,0.5) node {$N,V$};
+\draw ( 1.5,0.5) node {$A$};
+\draw ( 2.5,0.5) node {$N$};
+\draw ( 3.5,0.5) node {$N,V$};
+\draw (-1.5,1.5) node {$N$};
+\draw (-0.5,1.5) node {$N$};
+\draw ( 0.5,1.5) node {$$};
+\draw ( 1.5,1.5) node {$N$};
+\draw ( 2.5,1.5) node {$N$};
+%  \draw (-1.5,1.5) node {\small{}a};
+%  \draw (-0.5,1.5) node {\small{}b};
+%  \draw ( 0.5,1.5) node {\small{}c};
+%  \draw ( 1.5,1.5) node {\small{}d};
+%  \draw ( 2.5,1.5) node {\small{}e};
+\draw (-2.4, 5.5) node {$1$};
+\draw (-2.4, 4.5) node {$2$};
+\draw (-2.4, 3.5) node {$3$};
+\draw (-2.4, 2.5) node {$4$};
+\draw (-2.4, 1.5) node {$5$};
+\draw (-2.4, 0.5) node {$6$};
+\end{tikzpicture}
+\end{center}
+\noindent
+Now row 4 is in charge of all 3-word parts of the sentence, namely
+\begin{center}
+\begin{tabular}{llll}
+4a) & The $\mid$ trainer trains\\
+& The trainer $\mid$ trains\\
+4b) & trainer $\mid$ trains the\\
+& trainer trains $\mid$ the\\
+4c) & trains $\mid$ the student\\
+& trains the $\mid$ student\\
+4d) & the $\mid$ student team\\
+& the student $\mid$ team\\
+\end{tabular}
+\end{center}
+\noindent
+Note that in case of 3-word parts we have two splits. For example for
+4a: $\underbrace{\texttt{The}}_{A}$ and
+$\underbrace{\texttt{trainer trains}}_{N}$; and also
+$\underbrace{\texttt{The trainer}}_{N}$ and
+$\underbrace{\texttt{trains}}_{N,V}$. For each of these splits we have
+to look up in the rows below, which non-terminals we already
+computed. This allows us to look for right-hand sides in our grammar:
+$\meta{A}\cdot\meta{N}$, $\meta{N}\cdot\meta{N}$ and
+$\meta{N}\cdot\meta{V}$, which yield the only left-hand side
+\meta{N}. This is what we fill in for 4a. And so on for row 4.
+Row 3 is about all 4-word parts in the sentence, namely
+\begin{center}
+\begin{tabular}{llll}
+3a) & The trainer trains the\\
+3b) & trainer trains the student\\
+3c) & trains the student team\\
+\end{tabular}
+\end{center}
+\noindent
+Each of them can be split up in 3 ways, for example
+\begin{center}
+\begin{tabular}{llll}
+3a) & The $\mid$ trainer trains the\\
+& The trainer $\mid$ trains the\\
+& The trainer trains $\mid$ the\\
+\end{tabular}
+\end{center}
+\noindent
+and we have to consider them all in turn to fill in the non-terminals for
+3a. You can guess how it continues: row 2 is for all 5-word parts, and finally
+the field on the top is for the whole sentence.
+The idea of the CYK algorithm is that if in the top-field the starting
+symbol \meta{S} appears (possibly together with other non-terminals),
+then the sentence is accepted by the grammar. If it does not, then the
+sentence is not accepted.
+Let us very quickly calculate the complexity of the CYK
+algorithm. Lookup operations inside the triangle and in the grammar
+are assumed to be of constant time, $O(1)$, meaning they do not
+matter. How many fields are in the triangle\ldots
+$\frac{n}{2} * (n+1)$, where $n$ is the size of the input. That means
+roughly $O(n^2)$ fields. How much work do we have to do for each
+field? Well, for the top-most we have to consider $n-1$ splits, which
+means roughly $O(n)$ for each field. The overall result is a $O(n^3)$
+time-complexity for CYK. It turns out that this is the best worst-time
+complexity for parsing with context-free grammars.
 \end{document}
 %%% Parser combinators are now part of handout 6

changeset 798	aaf0bd0a211d
parent 727	eb9343126625
child 937	dc5ab66b11cc