# HG changeset patch # User Christian Urban # Date 1604022303 0 # Node ID aaf0bd0a211d9f6d57d307fe749dafeb701f4f50 # Parent ddcb616e036a595e0d8486942fbd1523ccfd9a50 updated diff -r ddcb616e036a -r aaf0bd0a211d graphics.sty --- a/graphics.sty Thu Oct 29 00:25:20 2020 +0000 +++ b/graphics.sty Fri Oct 30 01:45:03 2020 +0000 @@ -22,3 +22,30 @@ \bgroup\begin{minipage}{#1}\raggedright{}} {\end{minipage}\egroup;% \end{tikzpicture}\bigskip} + + +%%% for trees +%% http://anorien.csc.warwick.ac.uk/mirrors/CTAN/graphics/pgf/contrib/forest/forest.pdf + +\newcommand\grid[1]{% +\begin{tikzpicture}[baseline=(char.base)] + \path[use as bounding box] + (0,0) rectangle (1em,1em); + \draw[red!50, fill=red!20] + (0,0) rectangle (1em,1em); + \node[inner sep=1pt,anchor=base west] + (char) at (0em,\gridraiseamount) {#1}; +\end{tikzpicture}} +\newcommand\gridraiseamount{0.12em} + +\makeatletter +\newcommand\Grid[1]{% + \@tfor\z:=#1\do{\grid{\z}}} +\makeatother + +\newcommand\Vspace[1][.3em]{% + \mbox{\kern.06em\vrule height.3ex}% + \vbox{\hrule width#1}% + \hbox{\vrule height.3ex}} + +\def\VS{\Vspace[0.6em]} \ No newline at end of file diff -r ddcb616e036a -r aaf0bd0a211d handouts/ho05.pdf Binary file handouts/ho05.pdf has changed diff -r ddcb616e036a -r aaf0bd0a211d handouts/ho05.tex --- a/handouts/ho05.tex Thu Oct 29 00:25:20 2020 +0000 +++ b/handouts/ho05.tex Fri Oct 30 01:45:03 2020 +0000 @@ -3,6 +3,7 @@ \usepackage{../style} \usepackage{../langs} \usepackage{../grammar} +\usepackage{../graphics} % epsilon and left-recursion elimination % http://www.mollypages.org/page/grammar/index.mp @@ -419,7 +420,7 @@ \noindent To follow the general scheme of the transfromation, the $\alpha$ is either is either $0$ or $1$, and the $\beta$ happens -to be empty. SO we need to generate new rules for the form +to be empty. So we need to generate new rules for the form \mbox{$\meta{A} := \alpha\cdot\beta$}, which in our particular example means we obtain @@ -509,9 +510,9 @@ is recognised by the grammar. The CYK algorithm starts with the following triangular data structure. -\begin{figure}[t] +%%\begin{figure}[t] \begin{center} - \begin{tikzpicture}[scale=0.8,line width=0.8mm] + \begin{tikzpicture}[scale=0.7,line width=0.8mm] \draw (-2,0) -- (4,0); \draw (-2,1) -- (4,1); \draw (-2,2) -- (3,2); @@ -542,6 +543,12 @@ \draw ( 2.5,0.5) node {$N$}; \draw ( 3.5,0.5) node {$N,V$}; +% \draw (-1.5,1.5) node {\small{}a}; +% \draw (-0.5,1.5) node {\small{}b}; +% \draw ( 0.5,1.5) node {\small{}c}; +% \draw ( 1.5,1.5) node {\small{}d}; +% \draw ( 2.5,1.5) node {\small{}e}; + \draw (-2.4, 5.5) node {$1$}; \draw (-2.4, 4.5) node {$2$}; \draw (-2.4, 3.5) node {$3$}; @@ -550,7 +557,173 @@ \draw (-2.4, 0.5) node {$6$}; \end{tikzpicture} \end{center} -\end{figure} +%%\end{figure} + + +\noindent +The last row contains the information about all words and their +corresponding non-terminals. For example the field for \texttt{trains} +contains the information $\meta{N}$ and $\meta{V}$ because it can be a +``verb'' and a ``noun'' according to the grammar. The row above, +let's call the corresponding fields 5a to 5e, contains information +about 2-word parts of the sentence, namely + +\begin{center} +\begin{tabular}{llll} + 5a) & $\underbrace{\texttt{The}}_{A}$ $\mid$ $\underbrace{\texttt{trainer}}_{N}$ \\ + 5b) & $\underbrace{\texttt{trainer}}_{N}$ $\mid$ $\underbrace{\texttt{trains}}_{N,V}$\\ + 5c) & \texttt{trains} $\mid$ \texttt{the} \\ + 5d) & \texttt{the} $\mid$ \texttt{student} \\ + 5e) & \texttt{student} $\mid$ \texttt{team} \\ +\end{tabular} +\end{center} + +\noindent +For each of them, we look up in row 6 which non-terminals it belongs to +(indicated above for 5a and 5b). For 5a, with the non-terminals +\meta{A} and \meta{N}, we find the grammar rule + +\begin{plstx}[margin=1cm] + : \meta{N} ::= \meta{A}\cdot \meta{N}\\ +\end{plstx} + +\noindent +which means into field 5a we put the left-hand side of this rule, +which in this case is the non-terminal \meta{N}. For 5b we have to check +for both $\meta{N}\cdot\meta{N}$ and $\meta{N}\cdot\meta{V}$ whether there +is a right-hand side of this form in the grammar. But only the +grammar rule + +\begin{plstx}[margin=1cm] + : \meta{N} ::= \meta{N}\cdot \meta{N}\\ +\end{plstx} + +\noindent +matches, which means 5b gets also an \meta{N}. Continuing for all +fields in row 5 gives: + +\begin{center} + \begin{tikzpicture}[scale=0.7,line width=0.8mm] + \draw (-2,0) -- (4,0); + \draw (-2,1) -- (4,1); + \draw (-2,2) -- (3,2); + \draw (-2,3) -- (2,3); + \draw (-2,4) -- (1,4); + \draw (-2,5) -- (0,5); + \draw (-2,6) -- (-1,6); + + \draw (0,0) -- (0, 5); + \draw (1,0) -- (1, 4); + \draw (2,0) -- (2, 3); + \draw (3,0) -- (3, 2); + \draw (4,0) -- (4, 1); + \draw (-1,0) -- (-1, 6); + \draw (-2,0) -- (-2, 6); + + \draw (-1.5,-0.5) node {\footnotesize{}\texttt{The}}; + \draw (-0.5,-1.0) node {\footnotesize{}\texttt{trainer}}; + \draw ( 0.5,-0.5) node {\footnotesize{}\texttt{trains}}; + \draw ( 1.5,-1.0) node {\footnotesize{}\texttt{the}}; + \draw ( 2.5,-0.5) node {\footnotesize{}\texttt{student}}; + \draw ( 3.5,-1.0) node {\footnotesize{}\texttt{team}}; + + \draw (-1.5,0.5) node {$A$}; + \draw (-0.5,0.5) node {$N$}; + \draw ( 0.5,0.5) node {$N,V$}; + \draw ( 1.5,0.5) node {$A$}; + \draw ( 2.5,0.5) node {$N$}; + \draw ( 3.5,0.5) node {$N,V$}; + + \draw (-1.5,1.5) node {$N$}; + \draw (-0.5,1.5) node {$N$}; + \draw ( 0.5,1.5) node {$$}; + \draw ( 1.5,1.5) node {$N$}; + \draw ( 2.5,1.5) node {$N$}; + + +% \draw (-1.5,1.5) node {\small{}a}; +% \draw (-0.5,1.5) node {\small{}b}; +% \draw ( 0.5,1.5) node {\small{}c}; +% \draw ( 1.5,1.5) node {\small{}d}; +% \draw ( 2.5,1.5) node {\small{}e}; + + \draw (-2.4, 5.5) node {$1$}; + \draw (-2.4, 4.5) node {$2$}; + \draw (-2.4, 3.5) node {$3$}; + \draw (-2.4, 2.5) node {$4$}; + \draw (-2.4, 1.5) node {$5$}; + \draw (-2.4, 0.5) node {$6$}; + \end{tikzpicture} + \end{center} + +\noindent +Now row 4 is in charge of all 3-word parts of the sentence, namely + +\begin{center} +\begin{tabular}{llll} + 4a) & The $\mid$ trainer trains\\ + & The trainer $\mid$ trains\\ + 4b) & trainer $\mid$ trains the\\ + & trainer trains $\mid$ the\\ + 4c) & trains $\mid$ the student\\ + & trains the $\mid$ student\\ + 4d) & the $\mid$ student team\\ + & the student $\mid$ team\\ +\end{tabular} +\end{center} + +\noindent +Note that in case of 3-word parts we have two splits. For example for +4a: $\underbrace{\texttt{The}}_{A}$ and +$\underbrace{\texttt{trainer trains}}_{N}$; and also +$\underbrace{\texttt{The trainer}}_{N}$ and +$\underbrace{\texttt{trains}}_{N,V}$. For each of these splits we have +to look up in the rows below, which non-terminals we already +computed. This allows us to look for right-hand sides in our grammar: +$\meta{A}\cdot\meta{N}$, $\meta{N}\cdot\meta{N}$ and +$\meta{N}\cdot\meta{V}$, which yield the only left-hand side +\meta{N}. This is what we fill in for 4a. And so on for row 4. + +Row 3 is about all 4-word parts in the sentence, namely + +\begin{center} +\begin{tabular}{llll} + 3a) & The trainer trains the\\ + 3b) & trainer trains the student\\ + 3c) & trains the student team\\ +\end{tabular} +\end{center} + +\noindent +Each of them can be split up in 3 ways, for example + +\begin{center} +\begin{tabular}{llll} + 3a) & The $\mid$ trainer trains the\\ + & The trainer $\mid$ trains the\\ + & The trainer trains $\mid$ the\\ +\end{tabular} +\end{center} + +\noindent +and we have to consider them all in turn to fill in the non-terminals for +3a. You can guess how it continues: row 2 is for all 5-word parts, and finally +the field on the top is for the whole sentence. +The idea of the CYK algorithm is that if in the top-field the starting +symbol \meta{S} appears (possibly together with other non-terminals), +then the sentence is accepted by the grammar. If it does not, then the +sentence is not accepted. + +Let us very quickly calculate the complexity of the CYK +algorithm. Lookup operations inside the triangle and in the grammar +are assumed to be of constant time, $O(1)$, meaning they do not +matter. How many fields are in the triangle\ldots +$\frac{n}{2} * (n+1)$, where $n$ is the size of the input. That means +roughly $O(n^2)$ fields. How much work do we have to do for each +field? Well, for the top-most we have to consider $n-1$ splits, which +means roughly $O(n)$ for each field. The overall result is a $O(n^3)$ +time-complexity for CYK. It turns out that this is the best worst-time +complexity for parsing with context-free grammars. \end{document} diff -r ddcb616e036a -r aaf0bd0a211d slides/slides05.pdf Binary file slides/slides05.pdf has changed diff -r ddcb616e036a -r aaf0bd0a211d slides/slides05.tex --- a/slides/slides05.tex Thu Oct 29 00:25:20 2020 +0000 +++ b/slides/slides05.tex Fri Oct 30 01:45:03 2020 +0000 @@ -397,7 +397,7 @@ child {node {$\meta{E}$} child[sibling distance=10mm] {node {$\meta{T}$} child {node {$\meta{F}$} child {node {2}}} - child {node {+}} + child {node {*}} child {node {$\meta{T}$} child {node {$\meta{F}$} child {node {3}}}} } child {node {+}} @@ -475,6 +475,106 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[c] +\frametitle{CYK Algorithm} + +Suppose the grammar: + +\begin{center} +\bl{\begin{tabular}{@ {}lcl@ {}} +$\meta{S}$ & $::=$ & $\meta{N}\cdot \meta{P}$ \\ +$\meta{P}$ & $::=$ & $\meta{V}\cdot \meta{N}$ \\ +$\meta{N}$ & $::=$ & $\meta{N}\cdot \meta{N}$ \\ +$\meta{N}$ & $::=$ & $\texttt{students} \;|\; \texttt{Jeff} \;|\; \texttt{geometry} \;|\; \texttt{trains} $ \\ +$\meta{V}$ & $::=$ & $\texttt{trains}$ +\end{tabular}} +\end{center} + +\bl{\texttt{Jeff trains geometry students}} + +\end{frame} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[c] +\frametitle{CYK Algorithm} + +\begin{center} + \begin{tikzpicture}[scale=1,line width=0.8mm] + \draw (-2,0) -- (2,0); + \draw (-2,1) -- (2,1); + \draw (-2,2) -- (1,2); + \draw (-2,3) -- (0,3); + \draw (-2,4) -- (-1,4); + + \draw (0,0) -- (0, 3); + \draw (1,0) -- (1, 2); + \draw (2,0) -- (2, 1); + \draw (-1,0) -- (-1, 4); + \draw (-2,0) -- (-2, 4); + + \draw (-1.5,-0.5) node {\footnotesize{}\texttt{Jeff}}; + \draw (-0.5,-1.0) node {\footnotesize{}\texttt{trains}}; + \draw ( 0.5,-0.5) node {\footnotesize{}\texttt{geometry}}; + \draw ( 1.5,-1.0) node {\footnotesize{}\texttt{students}}; + + \draw (-1.5,0.5) node {$N$}; + \draw (-0.5,0.5) node {$N,V$}; + \draw ( 0.5,0.5) node {$N$}; + \draw ( 1.5,0.5) node {$N$}; + + \draw (-2.4, 3.5) node {$1$}; + \draw (-2.4, 2.5) node {$2$}; + \draw (-2.4, 1.5) node {$3$}; + \draw (-2.4, 0.5) node {$4$}; + \end{tikzpicture} + \end{center} + +\begin{textblock}{5}(10,10) +\small\bl{\begin{tabular}{@ {}lcl@ {}} +$\meta{S}$ & $::=$ & $\meta{N}\cdot \meta{P}$ \\ +$\meta{P}$ & $::=$ & $\meta{V}\cdot \meta{N}$ \\ +$\meta{N}$ & $::=$ & $\meta{N}\cdot \meta{N}$ \\ + $\meta{N}$ & $::=$ & $\texttt{students} \;|\; \texttt{Jeff}$\\ + & & $\;|\; \texttt{geometry} \;|\; \texttt{trains} $ \\ +$\meta{V}$ & $::=$ & $\texttt{trains}$ +\end{tabular}} +\end{textblock} + +\end{frame} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[t] +\frametitle{Chomsky Normal Form} + +A grammar for palindromes over the alphabet~\bl{$\{a,b\}$}: + +\bl{\begin{plstx}[margin=0cm] +: \meta{S} ::= a\cdot \meta{S}\cdot a | b\cdot \meta{S}\cdot b | a\cdot a | b\cdot b | a | b \\ +\end{plstx}} + +\end{frame} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[c] +\frametitle{CYK Algorithm} + + +\begin{itemize} +\item fastest possible algorithm for recognition problem +\item runtime is \bl{$O(n^3)$}\bigskip +\item grammars need to be transformed into CNF +\end{itemize} + +\end{frame} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[c] \frametitle{Context Sensitive Grammars} It is much harder to find out whether a string is parsed @@ -919,6 +1019,15 @@ \end{frame}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[t,fragile] +\begin{mybox3}{} + For CW2, please include '$\backslash$' as a symbol in strings, because + the collatz program contains + \begin{lstlisting}[language=Scala, numbers=none] + write "\n";\end{lstlisting} +\end{mybox3} +\end{frame} + \begin{frame}[t] \begin{mybox3}{} val (r1s, f1s) = simp(r1)\\ diff -r ddcb616e036a -r aaf0bd0a211d style.sty --- a/style.sty Thu Oct 29 00:25:20 2020 +0000 +++ b/style.sty Fri Oct 30 01:45:03 2020 +0000 @@ -5,7 +5,7 @@ \setmainfont[Ligatures=TeX]{Palatino Linotype} \usepackage{amssymb} \usepackage{amsmath} -\usepackage{menukeys} +%\usepackage{menukeys} \definecolor{darkblue}{rgb}{0,0,0.6} \usepackage[colorlinks=true,urlcolor=darkblue,linkcolor=darkblue]{hyperref} \usepackage{soul} @@ -24,31 +24,7 @@ \newcommand{\inj}{\textit{inj}} \newcommand{\nullable}{\textit{nullable}} -%%% for trees -%% http://anorien.csc.warwick.ac.uk/mirrors/CTAN/graphics/pgf/contrib/forest/forest.pdf -\newcommand\grid[1]{% -\begin{tikzpicture}[baseline=(char.base)] - \path[use as bounding box] - (0,0) rectangle (1em,1em); - \draw[red!50, fill=red!20] - (0,0) rectangle (1em,1em); - \node[inner sep=1pt,anchor=base west] - (char) at (0em,\gridraiseamount) {#1}; -\end{tikzpicture}} -\newcommand\gridraiseamount{0.12em} - -\makeatletter -\newcommand\Grid[1]{% - \@tfor\z:=#1\do{\grid{\z}}} -\makeatother - -\newcommand\Vspace[1][.3em]{% - \mbox{\kern.06em\vrule height.3ex}% - \vbox{\hrule width#1}% - \hbox{\vrule height.3ex}} - -\def\VS{\Vspace[0.6em]} %%% url pointers