# HG changeset patch # User Christian Urban # Date 1695386928 -3600 # Node ID 6ad0f63e19686a844c7ec8944a27b3a56e486fde # Parent 437e4f8d35d810ad0e455e13899e491855f681d8 updated diff -r 437e4f8d35d8 -r 6ad0f63e1968 cws/cw04.pdf Binary file cws/cw04.pdf has changed diff -r 437e4f8d35d8 -r 6ad0f63e1968 handouts/amm-ho.pdf Binary file handouts/amm-ho.pdf has changed diff -r 437e4f8d35d8 -r 6ad0f63e1968 handouts/amm-ho.tex --- a/handouts/amm-ho.tex Thu Sep 21 16:32:46 2023 +0100 +++ b/handouts/amm-ho.tex Fri Sep 22 13:48:48 2023 +0100 @@ -17,7 +17,7 @@ language you like, but I will show you all my code using Scala---I hope you have fond memories of Scala from PEP. The only difference with PEP is that I will use the current -stable version of Scala, which at teh time of writing is Scala 3.3.1. +stable version of Scala, which at the time of writing is Scala 3.3.1. \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] If you intend to submit your code for the CW in Scala, you \underline{MUST} submit code that @@ -72,6 +72,8 @@ \end{lstlisting} %% $ \noindent +This creates a file \code{amm} which before it can be run might +need some adjustments of the permissions. The big advantage of Ammonite is that it comes with some additional libraries already built-in and also allows one to easily break up code into smaller modules. For example reading and writing files in @@ -150,17 +152,14 @@ \noindent Of course this requires that you use \texttt{println} for inspecting any data as otherwise nothing will be displayed at the commandline. -\bigskip +\smallskip \noindent To sum up, Ammonite is a really useful addition to the Scala ecosystem. You can find more information about how to use it in the first five chapters of the ``Hands-on Scala'' book by Li Haoyi. These chapters are -free and can be used as a reference, see: +free and can be used as a reference, see \url{https://www.handsonscala.com/part-i-introduction-to-scala.html} -\begin{center} -\url{https://www.handsonscala.com/part-i-introduction-to-scala.html} -\end{center} \end{document} diff -r 437e4f8d35d8 -r 6ad0f63e1968 handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r 437e4f8d35d8 -r 6ad0f63e1968 handouts/ho01.tex --- a/handouts/ho01.tex Thu Sep 21 16:32:46 2023 +0100 +++ b/handouts/ho01.tex Fri Sep 22 13:48:48 2023 +0100 @@ -42,7 +42,7 @@ \begin{document} -\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021} +\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2023} \section*{Handout 1} @@ -52,7 +52,7 @@ written by Grace Hopper.\footnote{Who many years ago was invited on a talk show hosted by David Letterman. \here{https://youtu.be/3N_ywhx6_K0?t=31}} Why studying compilers -nowadays? An interesting answer is given by John Regher in his compiler +nowadays? An interesting answer is given by John Regehr in his compiler blog:\here{http://blog.regehr.org/archives/1419} \begin{quote}\it{}``We can start off with a couple of observations @@ -205,33 +205,10 @@ \end{tabular} \end{center} -\noindent With this table you can figure out the purpose of the regular -expressions in the web-crawlers shown Figures \ref{crawler1} and -\ref{crawler3}. In Figure~\ref{crawler1}, however, be careful with -the regular expression for http-addresses in Line~\ref{httpline}. It is -intended to be - -\[ -\pcode{"https?://[^"]*"} -\] - -\noindent specifying that http-addresses need to start with a double -quote, then comes \texttt{http} followed by an optional \texttt{s} and -so on\ldots{}until the closing double quote comes at the end of the -address. Normally we would have to escape the double quotes in order to -make sure we interpret the double quote as character, not as double -quote for a string. But Scala's trick with triple quotes allows us to -omit this kind of ugly escaping. As a result we can just write: - -\[ -\pcode{""""https?://[^"]*"""".r} -\] - -\noindent The convention in Scala is that \texttt{.r} converts a string -into a regular expression. I leave it to you to ponder whether this -regular expression really captures all possible web-addresses. If you -need a quick recap about regular expressions and how the match strings, -here is a quick video: \url{https://youtu.be/bgBWp9EIlMM}. +The syntax is pretty universal and can be found in many regular +expression libraries. If you need a quick recap about regular +expressions and how the match strings, here is a quick video: +\url{https://youtu.be/bgBWp9EIlMM}. \subsection*{Why Study Regular Expressions?} @@ -243,7 +220,7 @@ Why on earth then is there any interest in studying them again in depth in this module? Well, one answer is in the following two graphs about -regular expression matching in Python, Ruby, JavaScript and Java +regular expression matching in Python, Ruby, JavaScript, Swift and Java (Version 8). \begin{center} @@ -264,12 +241,13 @@ axis lines=left, width=5.5cm, height=4.5cm, - legend entries={Python, Java 8, JavaScript}, + legend entries={Python, Java 8, JavaScript, Swift}, legend pos=north west, legend cell align=left] \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data}; \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data}; \addplot[red,mark=*, mark options={fill=white}] table {re-js.data}; +\addplot[magenta,mark=*, mark options={fill=white}] table {re-swift.data}; \end{axis} \end{tikzpicture} & @@ -299,7 +277,7 @@ \end{tabular} \end{center} -\noindent The first graph shows that Python, JavaScript and Java 8 need +\noindent The first graph shows that Python, JavaScript, Swift and Java 8 need approximately 30 seconds to find out that the regular expression $\texttt{(a*)*\,b}$ does not match strings of 28 \texttt{a}s. Similarly, the second shows that Python and Ruby need approximately 29 seconds for finding @@ -338,7 +316,7 @@ relatively simple regular expression \begin{center} -\url{https://www.tutorialdocs.com/article/regex-trap.html} +\url{https://archive.ph/W5Ogx#selection-141.1-141.36} \end{center} \noindent @@ -696,18 +674,22 @@ expressions, not needing any automata (you will see we only touch briefly on automata in lecture 3). Automata are usually the main object of study in formal language courses. The avoidance of automata -is crucial for me because automata are graphs and it is rather difficult to -reason about graphs in theorem provers. In contrast, reasoning about -regular expressions is easy-peasy in theorem provers. Is this -important? I think yes, because according to Kuklewicz nearly all -POSIX-based regular expression matchers are +is crucial for me because automata are graphs and it is rather +difficult to reason about graphs in theorem provers. In contrast, +reasoning about regular expressions is easy-peasy in theorem +provers. Is this important? I think yes, because according to +Kuklewicz nearly all POSIX-based regular expression matchers are buggy.\footnote{\url{http://www.haskell.org/haskellwiki/Regex_Posix}} -With my PhD student Fahad Ausaf I proved the correctness for one such -matcher that was proposed by Sulzmann and Lu in -2014.\footnote{\url{http://goo.gl/bz0eHp}} Hopefully we can prove that -the machine code(!) that implements this code efficiently is correct -also. Writing programs in this way does not leave any room for -potential errors or bugs. How nice! +With my PhD students Fahad Ausaf and Chengsong Tan, I proved the +correctness for two such matchers that were proposed by Sulzmann and Lu +in 2014.\footnote{\url{http://goo.gl/bz0eHp}} A variant of which you have +already seen in PEP as CW3 and you will see again in the CFL in the first +two CWs. What we have not yet figured out that our matchers are +universally fast, meaning they do not explode on any input. +Hopefully we can also prove +that the machine code(!) that implements our matchers efficiently is +correct also. Writing programs in this way does not leave any room for +any errors or bugs. How nice! What also helped with my fascination with regular expressions is that we could indeed find out new things about them that @@ -721,14 +703,14 @@ shown via automata. So even somebody who has written a 700+-page book\footnote{\url{http://goo.gl/fD0eHx}} on regular expressions did not know better. Well, we showed it can also be -done with regular expressions only.\footnote{\url{http://nms.kcl.ac.uk/christian.urban/Publications/posix.pdf}} +done with regular expressions only.\footnote{\url{https://nms.kcl.ac.uk/christian.urban/Publications/rexp.pdf}} What a feeling when you are an outsider to the subject! To conclude: Despite my early ignorance about regular expressions, I find them now extremely interesting. They have practical importance (remember the shocking runtime of the regular expression matchers in -Python, Ruby and Java in some instances and the problems in Stack -Exchange and the Atom editor). They are used in tools like Snort and +Python, Ruby, Swift and Java in some instances and the problems in Stack +Exchange and the Atom editor---even regex libraries in more modern programming languages, like Rust, have their problems). They are used in tools like Snort and Zeek in order to monitor network traffic. They have a beautiful mathematical theory behind them, which can be sometimes quite deep and which sometimes contains hidden snares. People who are not very familiar @@ -791,18 +773,18 @@ \end{quote} -\begin{figure}[p]\small - \lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}] - {../progs/crawler1.scala} - -\caption{The Scala code for a simple web-crawler that checks -for broken links in web-pages. It uses the regular expression -\texttt{http\_pattern} in Line~\ref{httpline} for recognising -URL-addresses. It finds all links using the library function -\texttt{findAllIn} in Line~\ref{findallline} (this function -is part of Scala's regular expression library).\label{crawler1}} - -\end{figure} +%\begin{figure}[p]\small +% \lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}] +% {../progs/crawler1.scala} +% +%\caption{The Scala code for a simple web-crawler that checks +%for broken links in web-pages. It uses the regular expression +%\texttt{http\_pattern} in Line~\ref{httpline} for recognising +%URL-addresses. It finds all links using the library function +%\texttt{findAllIn} in Line~\ref{findallline} (this function +%is part of Scala's regular expression library).\label{crawler1}} +% +%\end{figure} @@ -820,18 +802,18 @@ %not.\label{crawler2}} %\end{figure} -\begin{figure}[p]\small - \lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}] - {../progs/crawler2.scala} - -\caption{A small email harvester---whenever we download a -web-page, we also check whether it contains any email -addresses. For this we use the regular expression -\texttt{email\_pattern} in Line~\ref{emailline}. The main -change is in Line~\ref{mainline} where all email addresses -that can be found in a page are printed.\label{crawler3}} - -\end{figure} +%\begin{figure}[p]\small +% \lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\co%lor{capri!3}\fi}] +% {../progs/crawler2.scala} +% +%\caption{A small email harvester---whenever we download a +%web-page, we also check whether it contains any email +%addresses. For this we use the regular expression +%\texttt{email\_pattern} in Line~\ref{emailline}. The main +%change is in Line~\ref{mainline} where all email addresses +%that can be found in a page are printed.\label{crawler3}} +% +%\end{figure} \begin{figure}[p] \tiny