--- a/progs/scala/autos.scala Fri Jan 12 02:33:35 2018 +0000
+++ b/progs/scala/autos.scala Tue May 15 10:24:25 2018 +0100
@@ -383,7 +383,7 @@
*/
-// Regular expressions fro derivative automata
+// Regular expressions for derivative automata
abstract class Rexp
case object ZERO extends Rexp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/text/langs.sty Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,76 @@
+\usepackage{listings}
+\usepackage{etoolbox}
+\setmonofont[Scale=.95]{Consolas}
+\newfontfamily{\consolas}{Consolas}
+
+\definecolor{codered}{rgb}{0.6,0,0} % for strings
+\definecolor{codegreen}{rgb}{0.25,0.5,0.35} % comments
+\definecolor{codepurple}{rgb}{0.5,0,0.35} % keywords
+\definecolor{codedocblue}{rgb}{0.25,0.35,0.75} % doc
+\definecolor{codeblue}{rgb}{0.25,0.35,0.75} % types
+
+\BeforeBeginEnvironment{lstlisting}{\par\noindent\begin{minipage}{\linewidth}}
+\AfterEndEnvironment{lstlisting}{\end{minipage}\par}
+
+\lstdefinelanguage{Scala}{
+ morekeywords={abstract,case,catch,class,def,%
+ do,else,extends,false,final,finally,%
+ for,if,implicit,import,match,mixin,%
+ new,null,object,override,package,%
+ private,protected,requires,return,sealed,%
+ super,this,throw,trait,true,try,%
+ type,val,var,while,with,yield,write,read},%
+ otherkeywords={=>,<-,<\%,<:,>:,\#},%
+ sensitive=true,%
+ %directives={Int,Char,Rexp,String,Boolean,BigInt,Unit,List,Set},%
+ %moredelim=*[directive]:,%
+ morecomment=[l]{//},%
+ morecomment=[n]{/*}{*/},
+ morestring=[s]{"""}{"""},
+ morestring=[b]",
+ morestring=[b]',
+}[keywords,comments,strings]
+
+\lstdefinelanguage{While}{
+ morekeywords={if,then,else,while,do,true,false,write,upto,read,for,skip},
+ morecomment=[l]{//},
+ morecomment=[n]{/*}{*/},
+ morestring=[b]",
+ otherkeywords={=,!=,:=,<,>,\%,;,*,/},
+}[keywords,comments,strings]
+
+
+\newcommand{\code}[1]{{\lstinline{#1}}}
+\newcommand{\pcode}[1]{\mbox{\lstset{language={},keywordstyle=\color{black}}\lstinline!#1!}}
+\newcommand{\scode}[1]{\mbox{\lstset{language={},basicstyle=\ttfamily\color{codegreen}}\lstinline!#1!}}
+\makeatother
+
+%%\lstset{escapeinside={(*@}{@*)}}
+\lstset{escapeinside={/*@}{@*/}}
+
+%% stripy code
+\usepackage{lstlinebgrd}
+\definecolor{capri}{rgb}{0.0, 0.75, 1.0}
+
+
+\lstdefinestyle{mystyle}
+ {basicstyle=\ttfamily,
+ keywordstyle=\color{codepurple}\bfseries,
+ stringstyle=\color{codegreen},
+ commentstyle=\color{codegreen},
+ morecomment=[s][\color{codedocblue}]{/**}{*/},
+ numbers=none,
+ numberstyle=\tiny\color{black},
+ stepnumber=1,
+ numbersep=10pt,
+ tabsize=2,
+ showspaces=false,
+ showstringspaces=false,
+ xleftmargin=8mm,
+ emphstyle=\color{codeblue}\bfseries,
+ keepspaces,
+ linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}
+}
+
+\lstset{language=Scala,
+ style=mystyle}
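+
+%% A minimal usage sketch (illustrative only, assuming a document that
+%% inputs this file; lstlisting defaults to Scala with mystyle):
+%%
+%%   \begin{lstlisting}
+%%   // zero or more a's followed by b, as a regular expression string
+%%   val evil = "(a*)*b"
+%%   \end{lstlisting}
+%%
+%% Inline code can be typeset with \code{...}, plain typewriter text
+%% with \pcode{...}.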
Binary file text/proposal.pdf has changed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/text/proposal.tex Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,429 @@
+\documentclass{article}
+\usepackage{url}
+\usepackage{color}\definecolor{darkblue}{rgb}{0,0,0.5}
+\usepackage[a4paper,colorlinks=true,linkcolor=darkblue,citecolor=darkblue,filecolor=darkblue,pagecolor=darkblue,urlcolor=darkblue]{hyperref}
+\usepackage{style}
+\usepackage{langs}
+\usepackage{tikz}
+\usepackage{pgf}
+\usepackage{pgfplots}
+\usepackage{stackengine}
+
+\addtolength{\oddsidemargin}{-6mm}
+\addtolength{\evensidemargin}{-6mm}
+\addtolength{\textwidth}{12mm}
+
+
+
+%% \usepackage{accents}
+\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
+
+\begin{filecontents}{re-python2.data}
+1 0.033
+5 0.036
+10 0.034
+15 0.036
+18 0.059
+19 0.084
+20 0.141
+21 0.248
+22 0.485
+23 0.878
+24 1.71
+25 3.40
+26 7.08
+27 14.12
+28 26.69
+\end{filecontents}
+
+\begin{filecontents}{re-java.data}
+5 0.00298
+10 0.00418
+15 0.00996
+16 0.01710
+17 0.03492
+18 0.03303
+19 0.05084
+20 0.10177
+21 0.19960
+22 0.41159
+23 0.82234
+24 1.70251
+25 3.36112
+26 6.63998
+27 13.35120
+28 29.81185
+\end{filecontents}
+
+\begin{filecontents}{re-java9.data}
+1000 0.01410
+2000 0.04882
+3000 0.10609
+4000 0.17456
+5000 0.27530
+6000 0.41116
+7000 0.53741
+8000 0.70261
+9000 0.93981
+10000 0.97419
+11000 1.28697
+12000 1.51387
+14000 2.07079
+16000 2.69846
+20000 4.41823
+24000 6.46077
+26000 7.64373
+30000 9.99446
+34000 12.966885
+38000 16.281621
+42000 19.180228
+46000 21.984721
+50000 26.950203
+60000 43.0327746
+\end{filecontents}
+
+
+\begin{document}
+
+
+\section*{Proposal:\\
+ Fast Regular Expression Matching\\
+ with Brzozowski Derivatives}
+
+\subsubsection*{Summary}
+
+If you want to connect a computer directly to the Internet, it must be
+immediately \mbox{hardened} against outside attacks. The current
+technology for this is to use regular expressions in order to
+automatically scan all incoming network traffic for signs that a
+computer is under attack and, if such signs are found, to take
+appropriate counter-actions. One possible action is to slow down the traffic
+from sites where repeated password-guesses originate. Well-known
+network intrusion prevention systems that use regular expressions for
+traffic analysis are Snort and Bro.
+
+Given the large
+volume of Internet traffic that even the smallest servers have to handle
+nowadays, regular expression matching for traffic analysis has become a
+real bottleneck. This is not just a nuisance, but actually a security
+vulnerability in itself: it can lead to denial-of-service attacks.
+The proposed project aims to remove this bottleneck by using a little-known
+technique of building derivatives of regular expressions. These
+derivatives have not yet been used in the area of network traffic
+analysis, but have the potential to solve some tenacious problems with
+existing approaches. The project will require theoretical work, but
+also a practical implementation (a proof-of-concept at least). The
+work will build on earlier work by Ausaf and Urban from King's College
+London \cite{AusafDyckhoffUrban2016}.
+
+
+\subsubsection*{Background}
+
+If every 10 minutes a thief turned up at your car and rattled
+violently at the door handles to see if he could get in, you would move
+your car to a safer neighbourhood. Computers, however, need to stay
+connected to the Internet all the time and there they have to withstand such
+attacks. A rather typical instance of an attack can be seen in the
+following log entries from a server at King's:
+
+{\small
+\begin{verbatim}
+ Jan 2 00:53:19 talisker sshd: Received disconnect from 110.53.183.227: [preauth]
+ Jan 2 00:58:31 talisker sshd: Received disconnect from 110.53.183.252: [preauth]
+ Jan 2 01:01:28 talisker sshd: Received disconnect from 221.194.47.236: [preauth]
+ Jan 2 01:03:59 talisker sshd: Received disconnect from 110.53.183.228: [preauth]
+ Jan 2 01:06:53 talisker sshd: Received disconnect from 221.194.47.245: [preauth]
+ ...
+\end{verbatim}}
+
+
+\noindent
+This is a record of the network activities from the server
+\href{http://talisker.inf.kcl.ac.uk/cgi-bin/repos.cgi}{talisker.inf.kcl.ac.uk},
+which hosts lecture material for students. The log indicates several
+unsuccessful login attempts via the \texttt{ssh} program from
+computers with the addresses \texttt{110.53.183.227} and so on. This
+is a clear sign of a \emph{brute-force attack} where hackers
+systematically try out password candidates. Such attacks are blunt,
+but unfortunately very powerful. They can originate from anywhere in
+the world; they are automated and often conducted from computers that
+have previously been hijacked. Once the attacker ``hits'' on the
+correct password, he or she gets access to the server. The only
+way to prevent this methodical password-guessing is to spot the
+corresponding traces in the log and then slow down the traffic from
+such sites so that perhaps only one password per hour can be
+tried out. This does not affect ``normal'' operations of the server,
+but drastically decreases the chances of hitting the correct password
+by a brute-force attack.
+
+
+Server administrators use \emph{regular expressions} for scanning log
+files. The purpose of these expressions is to specify patterns of
+interest (for example \texttt{Received disconnect from 110.53.183.227}
+where the address needs to be variable). A program will
+then continuously scan for such patterns and trigger an action if a
+pattern is found. Popular examples of such programs are Snort and Bro
+\cite{snort,bro}. Clearly, there are also other kinds of
+vulnerabilities hackers try to take advantage of---mainly programming
+mistakes that can be abused to gain access to a computer. This means
+server administrators maintain suites of sometimes thousands of regular
+expressions, each describing a suspicious pattern that indicates a
+computer is under attack.
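+
+\noindent
+For illustration, such a pattern could look as follows (a hypothetical
+sketch written in Scala, the language used for the prototypes in this
+repository; the concrete pattern syntax of tools like Snort and Bro
+differs):
+
+\begin{lstlisting}
+// a hypothetical pattern for the log entries shown above; the
+// address part is kept variable and captured as a group
+val pat = """Received disconnect from (\d{1,3}(?:\.\d{1,3}){3}):""".r
+
+val line = "Jan 2 00:53:19 talisker sshd: " +
+           "Received disconnect from 110.53.183.227: [preauth]"
+pat.findFirstMatchIn(line).foreach(m => println(m.group(1)))
+// prints: 110.53.183.227
+\end{lstlisting}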
+
+
+The underlying algorithmic problem is to decide whether a string in
+the logs matches the patterns determined by the regular
+expressions. Such decisions need to be made as fast as possible,
+otherwise the scanning programs would not be useful as a hardening
+technique for servers. Achieving this speed is presently a rather big
+challenge, given the amount of traffic typical servers have to cope
+with and the large number of patterns that might be of interest. The
+problem is that when thousands of regular expressions are involved,
+the process of detecting whether a string matches a regular expression
+can be excruciatingly slow. This does not happen in most instances,
+but in a small number of cases it does, and in these cases the
+matching algorithm can grind to a halt. This phenomenon is called
+\emph{catastrophic backtracking} \cite{catast}. Unfortunately,
+catastrophic backtracking is not just a nuisance, but a security
+vulnerability in itself. The reason is that attackers look for these
+instances where the scan is very slow and then trigger a
+\emph{denial-of-service attack} against the server\ldots meaning the
+server is no longer reachable by normal users.
+
+Existing approaches try to mitigate the speed problem with regular
+expressions by implementing the matching algorithms in hardware,
+rather than in software \cite{hardware}. Although this solution offers
+speed, it is often unappealing because attack patterns can change or
+need to be augmented. A software solution is clearly more desirable in
+these circumstances. Also, given that regular expressions were
+introduced in 1951 by Kleene, one might think regular expressions have
+since been studied and implemented to death. But this would definitely
+be a mistake: in fact they are still an active research area and
+off-the-shelf implementations are still extremely prone to the
+problem of catastrophic backtracking. This can be seen in the
+following two graphs:
+
+\begin{center}
+\begin{tabular}{@{}cc@{}}
+%\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings
+% $\underbrace{a\ldots a}_{n}$}\smallskip\\
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,0.0)}},
+ ylabel={time in secs},
+ y label style={at={(0.06,0.5)}},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=45,
+ ytick={0,10,...,40},
+ scaled ticks=false,
+ axis lines=left,
+ width=6cm,
+ height=4.5cm,
+ legend entries={Python, Java 8},
+ legend pos=north west]
+\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,0.0)}},
+ ylabel={time in secs},
+ y label style={at={(0.06,0.5)}},
+ %enlargelimits=false,
+ %xtick={0,5000,...,30000},
+ xmax=65000,
+ ymax=45,
+ ytick={0,10,...,40},
+ scaled ticks=false,
+ axis lines=left,
+ width=6cm,
+ height=4.5cm,
+ legend entries={Java 9},
+ legend pos=north west]
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
+\end{axis}
+\end{tikzpicture}
+\end{tabular}
+\end{center}
+
+\noindent
+These graphs show the time needed for deciding whether a string of
+$n$ $a\,$'s is matched by the rather simple regular expression
+$(a^*)^* \,b$, using the built-in regular expression matchers of the
+popular programming languages Python and Java (Version 8 and Version
+9). There are many more regular expressions that show the same
+behaviour. The graph on the left-hand side shows that for a string
+consisting of just 28 $a\,$'s, Python and the older Java (which was
+the latest version of Java until September 2017) already need 30
+seconds to decide whether this string is matched or not. This is an
+abysmal result given that Python and Java are very popular programming
+languages. We already know that this problem can be decided much, much
+faster. If these slow regular expression matchers were employed in
+network traffic analysis, then this example would already be an
+opportunity for mounting an easy denial-of-service attack: one just
+has to send the server a large enough string of $a$'s. The newer
+version of Java is better---it can match strings of around 50,000
+$a\,$'s in 30 seconds---however this is still not good enough to be a
+useful tool for network traffic analysis. What is interesting is that
+even a relatively simple-minded regular expression matcher based on
+Brzozowski derivatives can out-compete the regular expression
+matchers in Java and Python on such catastrophic backtracking
+examples. This gain in speed is the motivation and starting point for
+the proposed project. \smallskip
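+
+The catastrophic behaviour shown in the graphs is easy to reproduce.
+The following sketch (illustrative only) measures Java's built-in
+matcher, which Scala can call directly, on the regular expression
+$(a^*)^* \,b$ from above:
+
+\begin{lstlisting}
+import java.util.regex.Pattern
+
+// backtracks catastrophically: the input contains no b, so all
+// ways of splitting the a's between the two stars are tried
+val evil = Pattern.compile("(a*)*b")
+
+for (n <- 5 to 25 by 5) {
+  val input = "a" * n
+  val start = System.nanoTime()
+  evil.matcher(input).matches()
+  println(n + ": " + (System.nanoTime() - start) / 1e9 + " secs")
+}
+\end{lstlisting}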
+
+\subsection*{Research Plan}
+
+Regular expressions are already an old and well-studied topic in
+Computer Science. They were originally introduced by Kleene in 1951
+and are utilised in text search algorithms for ``find'' or ``find and
+replace'' operations \cite{Brian,Kleene51}. Because of their wide
+range of applications, many programming languages provide capabilities
+for regular expression matching via libraries. The classic theory
+of regular expressions involves just 6 different kinds of regular
+expressions, often defined in terms of the following grammar:
+
+\begin{center}\small
+\begin{tabular}{lcll}
+ $r$ & $::=$ & $\ZERO$ & cannot match anything\\
+ & $|$ & $\ONE$ & can only match the empty string\\
+ & $|$ & $c$ & can match a single character (in this case $c$)\\
+ & $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
+ & $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
+ & & & then the second part with $r_2$\\
+ & $|$ & $r^*$ & can match $r$ zero or more times\\
+\end{tabular}
+\end{center}
+
+\noindent
+For practical purposes, however, regular expressions have been
+extended in various ways in order to make them even more powerful and
+even more useful for applications. Some of the problems to do with
+catastrophic backtracking stem, unfortunately, from these extensions.
+
+The crux of this project is not to use automata for regular expression
+matching (the traditional technique), but to use derivatives of
+regular expressions instead. These derivatives were introduced by
+Brzozowski in 1964 \cite{Brzozowski1964}, but somehow had been lost
+``in the sands of time'' \cite{Owens2009}. However, they have recently
+become a ``hot'' research topic again, with numerous research papers
+appearing in the last five years. One reason for this interest is that
+Brzozowski derivatives straightforwardly extend to more general
+regular expression constructors, which cannot be easily treated with
+standard techniques. They can also be implemented with ease in any
+functional programming language: the definition of derivatives
+consists of just two simple recursive functions shown in
+Figure~\ref{f}. Moreover, it can easily be shown (by a mathematical
+proof) that the resulting regular expression matcher always produces
+a correct answer. This is an area where Urban can provide a lot of
+expertise for this project: he is one of the main developers of the
+Isabelle theorem prover, which is a program designed for such proofs
+\cite{Isabelle}.
+
+
+\begin{figure}[t]
+\begin{center}\small
+\begin{tabular}{lcl}
+$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
+$\textit{nullable}(\ONE)$ & $\dn$ & $\textit{true}$\\
+$\textit{nullable}(c)$ & $\dn$ & $\textit{false}$\\
+$\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
+$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
+$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\medskip\\
+
+$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
+$\textit{der}\;c\;(\ONE)$ & $\dn$ & $\ZERO$\\
+$\textit{der}\;c\;(d)$ & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
+$\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
+$\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
+ & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
+ & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
+$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\[-5mm]
+\end{tabular}
+\end{center}
+\caption{The complete definition of derivatives of regular expressions
+ \cite{Brzozowski1964}.
+ This definition can be implemented with ease in functional programming
+ languages and can easily be extended to the regular expressions used in
+ practice. Derivatives are more flexible for applications and easier to
+ manipulate in mathematical proofs than traditional techniques for
+ regular expression matching.\medskip\label{f}}
+\end{figure}
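+
+\noindent
+To convey how directly these definitions translate into code, here is
+a minimal sketch in Scala (unoptimised and for illustration only; the
+datatype mirrors the one in the accompanying \texttt{autos.scala}):
+
+\begin{lstlisting}
+abstract class Rexp
+case object ZERO extends Rexp
+case object ONE extends Rexp
+case class CHAR(c: Char) extends Rexp
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class STAR(r: Rexp) extends Rexp
+
+// can r match the empty string?
+def nullable(r: Rexp): Boolean = r match {
+  case ZERO => false
+  case ONE => true
+  case CHAR(_) => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_) => true
+}
+
+// the derivative of r with respect to the character c
+def der(c: Char, r: Rexp): Rexp = r match {
+  case ZERO => ZERO
+  case ONE => ZERO
+  case CHAR(d) => if (c == d) ONE else ZERO
+  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
+  case SEQ(r1, r2) =>
+    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
+    else SEQ(der(c, r1), r2)
+  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
+}
+
+// a string matches r iff the iterated derivative is nullable
+def matches(r: Rexp, s: String): Boolean =
+  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
+\end{lstlisting}
+
+\noindent
+For speed, an implementation would in addition simplify the
+intermediate regular expressions after each derivative step.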
+
+There are a number of avenues for research on Brzozowski derivatives.
+One problem I would like to tackle immediately in this project is how
+to handle \emph{back-references} in regular expressions. It is known
+that the inclusion of back-references causes the underlying algorithmic
+problem to become NP-hard \cite{Aho}, and it is the main reason why
+regular expression matchers suffer from the catastrophic backtracking
+problem. However, the full generality of back-references is not
+needed in practical applications. The goal therefore is to restrict
+the problem sufficiently so that an efficient algorithm can be
+designed. There seem to be good indications that Brzozowski
+derivatives are useful for that.
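+
+\noindent
+To give a flavour of back-references, consider the following sketch
+(using Java's regular expressions, which Scala can call directly):
+
+\begin{lstlisting}
+import java.util.regex.Pattern
+
+// \1 refers back to whatever the first group matched, so this
+// pattern matches exactly the words written twice in a row
+val p = Pattern.compile("(\\w+)\\1")
+println(p.matcher("abab").matches())  // true, group 1 is "ab"
+println(p.matcher("abc").matches())   // false
+\end{lstlisting}
+
+\noindent
+The language of such repeated strings $ww$ cannot be recognised by any
+classic regular expression, which indicates why back-references make
+the matching problem so much harder.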
+
+Another problem is \emph{how} regular expressions match a string
+\cite{Sulzmann2014}. In this case the algorithm does not just give a
+yes/no answer about whether the regular expression matches the string,
+but also generates a value that encodes which part of the string is
+matched by which part of the regular expression. This is important in
+applications like network analysis, where a specific computer address
+needs to be extracted from a matching log entry. Compilers also make
+extensive use of such an extension when they lex their source
+programs, i.e.~break up code into word-like components. Again, Ausaf
+and Urban from King's made some initial progress on proving the
+correctness of such an extension, but their algorithm is not yet fast
+enough for practical purposes \cite{AusafDyckhoffUrban2016}. It would
+be desirable to study optimisations that make the algorithm faster,
+but preserve the correctness guarantees obtained by Ausaf and Urban.
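+
+\noindent
+To sketch what such an algorithm computes, the values of Sulzmann and
+Lu \cite{Sulzmann2014} can be represented in Scala roughly as follows
+(the names are illustrative):
+
+\begin{lstlisting}
+// a value records how a regular expression matched a string
+abstract class Val
+case object Empty extends Val                  // for ONE
+case class Chr(c: Char) extends Val            // for CHAR
+case class Sequ(v1: Val, v2: Val) extends Val  // for SEQ
+case class Lft(v: Val) extends Val             // left ALT branch
+case class Rght(v: Val) extends Val            // right ALT branch
+case class Stars(vs: List[Val]) extends Val    // STAR iterations
+\end{lstlisting}
+
+\noindent
+From such a value one can read off which part of the string was
+matched by which part of the regular expression, for instance the
+computer address in a log entry.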
+
+\mbox{}\\[-11mm]
+
+\subsubsection*{Conclusion}
+
+Much research has already gone into regular expressions, and one might
+think they have been studied and implemented to death. But this is far
+from the truth. In fact regular expressions have become a ``hot''
+research topic in recent years because of problems with the
+traditional methods (in applications such as network traffic
+analysis), but also because the technique of derivatives of regular
+expressions has been rediscovered. These derivatives provide an
+alternative approach to regular expression matching. Their potential
+lies in the fact that they are more flexible in implementations and
+much easier to manipulate in mathematical proofs. Therefore I would
+like to research them in this project.\medskip
+
+\noindent {\bf Impact:} Regular expression matching is a core
+technology in Computer Science with numerous applications. I will
+focus on the area of network traffic analysis and also lexical
+analysis. In these areas this project can have quite a large impact:
+for example the program Bro has been downloaded at least 10,000 times and Snort
+even has 5 million downloads and over 600,000 registered users
+\cite{snort,bro}. Lexical analysis is a core component of every
+compiler, and compilers are the bedrock on which nearly all
+programming is built.
+
+% {\small
+% \begin{itemize}
+% \item[$\bullet$] \url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
+% \item[$\bullet$] \url{https://vimeo.com/112065252}
+% \item[$\bullet$] \url{http://davidvgalbraith.com/how-i-fixed-atom/}
+% \end{itemize}}
+\mbox{}\\[-11mm]
+
+\small
+\bibliographystyle{plain}
+\renewcommand{\refname}{References\\[-7mm]\mbox{}}
+\bibliography{proposal}
+
+\end{document}
+
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/text/style.sty Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,79 @@
+\usepackage{xcolor}
+%%\usepackage{fontspec}
+\usepackage[sc]{mathpazo}
+\usepackage{fontspec}
+\setmainfont[Ligatures=TeX]{Palatino Linotype}
+\usepackage{amssymb}
+\usepackage{amsmath}
+\usepackage{menukeys}
+\definecolor{darkblue}{rgb}{0,0,0.6}
+\usepackage[colorlinks=true,urlcolor=darkblue,linkcolor=darkblue]{hyperref}
+
+%%% for regular expressions and values
+\newcommand{\ZERO}{\mbox{\bf 0}}
+\newcommand{\ONE}{\mbox{\bf 1}}
+\newcommand{\Left}{\textit{Left}}
+\newcommand{\Der}{\textit{Der}}
+\newcommand{\der}{\textit{der}}
+\newcommand{\Ders}{\textit{Ders}}
+\newcommand{\ders}{\textit{ders}}
+
+%%% for trees
+%% http://anorien.csc.warwick.ac.uk/mirrors/CTAN/graphics/pgf/contrib/forest/forest.pdf
+
+\newcommand\grid[1]{%
+\begin{tikzpicture}[baseline=(char.base)]
+ \path[use as bounding box]
+ (0,0) rectangle (1em,1em);
+ \draw[red!50, fill=red!20]
+ (0,0) rectangle (1em,1em);
+ \node[inner sep=1pt,anchor=base west]
+ (char) at (0em,\gridraiseamount) {#1};
+\end{tikzpicture}}
+\newcommand\gridraiseamount{0.12em}
+
+\makeatletter
+\newcommand\Grid[1]{%
+ \@tfor\z:=#1\do{\grid{\z}}}
+\makeatother
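+%% e.g. \Grid{abb} typesets each character in a small red box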
+
+\newcommand\Vspace[1][.3em]{%
+ \mbox{\kern.06em\vrule height.3ex}%
+ \vbox{\hrule width#1}%
+ \hbox{\vrule height.3ex}}
+
+\def\VS{\Vspace[0.6em]}
+
+
+\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}
+\newcommand{\defn}[1]{\textit{\textbf{#1}}}
+\newcommand{\dq}[1]{\mbox{\tt{"}}#1\mbox{\tt{"}}}
+
+\definecolor{codegray}{gray}{0.9}
+
+\makeatletter
+\def\fnote{\gdef\@thefnmark{}\@footnotetext}
+\makeatother
+
+\newcommand{\HEADER}{{\bf Please submit your solutions via email. Please submit
+only ASCII text or PDFs. Every solution should be preceded by the corresponding
+question text, like:
+
+\begin{center}
+\begin{tabular}{ll}
+Q$n$: & \ldots a difficult question from me\ldots\\
+A: & \ldots an answer from you \ldots\\
+Q$n+1$: & \ldots another difficult question\ldots\\
+A: & \ldots another brilliant answer from you\ldots
+\end{tabular}
+\end{center}
+
+\noindent Solutions will only be accepted until 20th December! Please send only
+one homework per email.}\bigskip}
+
+\newcommand{\POSTSCRIPT}{
+{\bf (Optional)} This question is for you to provide
+regular feedback to me: for example
+what were the most interesting, least interesting, or confusing
+parts in this lecture? Any problems with my Scala code? Please
+feel free to share any other questions or concerns.}
--- a/thys/Exercises.thy Fri Jan 12 02:33:35 2018 +0000
+++ b/thys/Exercises.thy Tue May 15 10:24:25 2018 +0100
@@ -30,6 +30,8 @@
zeroable r1 \<or> zeroable r2 \<or> (atmostempty r1 \<and> atmostempty r2)"
| "atmostempty (STAR r) = atmostempty r"
+
+
fun
somechars :: "rexp \<Rightarrow> bool"
where
@@ -50,8 +52,7 @@
apply blast
apply(auto)
using Star_cstring
-by (metis concat_eq_Nil_conv)
-
+ by (metis concat_eq_Nil_conv)
lemma atmostempty_correctness_aux:
shows "atmostempty r \<longleftrightarrow> \<not> somechars r"
@@ -65,6 +66,33 @@
by(auto simp add: atmostempty_correctness_aux somechars_correctness)
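+(* leastsinglechar r holds iff r can match some single-character
+   string, see the correctness lemma below *)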
fun
+ leastsinglechar :: "rexp \<Rightarrow> bool"
+where
+ "leastsinglechar (ZERO) \<longleftrightarrow> False"
+| "leastsinglechar (ONE) \<longleftrightarrow> False"
+| "leastsinglechar (CHAR c) \<longleftrightarrow> True"
+| "leastsinglechar (ALT r1 r2) \<longleftrightarrow> leastsinglechar r1 \<or> leastsinglechar r2"
+| "leastsinglechar (SEQ r1 r2) \<longleftrightarrow>
+ (if (zeroable r1 \<or> zeroable r2) then False
+ else ((nullable r1 \<and> leastsinglechar r2) \<or> (nullable r2 \<and> leastsinglechar r1)))"
+| "leastsinglechar (STAR r) \<longleftrightarrow> leastsinglechar r"
+
+lemma leastsinglechar_correctness:
+ "leastsinglechar r \<longleftrightarrow> (\<exists>c. [c] \<in> L r)"
+ apply(induct r)
+ apply(simp)
+ apply(simp)
+ apply(simp)
+ prefer 2
+ apply(simp)
+ apply(blast)
+ prefer 2
+ apply(simp)
+ using Star.step Star_decomp apply fastforce
+ apply(simp add: Sequ_def zeroable_correctness nullable_correctness)
+ by (metis append_Nil append_is_Nil_conv butlast_append butlast_snoc)
+
+fun
infinitestrings :: "rexp \<Rightarrow> bool"
where
"infinitestrings (ZERO) = False"
--- a/thys/Simplifying.thy Fri Jan 12 02:33:35 2018 +0000
+++ b/thys/Simplifying.thy Tue May 15 10:24:25 2018 +0100
@@ -32,6 +32,7 @@
| "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, f\<^sub>2) = (r\<^sub>1, F_LEFT f\<^sub>1)"
| "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2) = (ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"
+(* where is ZERO *)
fun simp_SEQ where
"simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2) = (r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"
| "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2) = (r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"
@@ -227,6 +228,144 @@
ultimately show "slexer r (c # s) = lexer r (c # s)"
by (simp del: slexer.simps add: slexer_better_simp)
qed
-qed
+ qed
+
+
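+(* smart constructors: simp2_ALT removes ZERO alternatives, simp2_SEQ
+   collapses SEQs with ZERO or ONE and records results in 'seen' *)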
+fun simp2_ALT where
+ "simp2_ALT ZERO r2 seen = (r2, seen)"
+| "simp2_ALT r1 ZERO seen = (r1, seen)"
+| "simp2_ALT r1 r2 seen = (ALT r1 r2, seen)"
+
+fun simp2_SEQ where
+ "simp2_SEQ ZERO r2 seen = (ZERO, seen)"
+| "simp2_SEQ r1 ZERO seen = (ZERO, seen)"
+| "simp2_SEQ ONE r2 seen = (r2, seen \<union> {r2})"
+| "simp2_SEQ r1 ONE seen = (r1, seen \<union> {r1})"
+| "simp2_SEQ r1 r2 seen = (SEQ r1 r2, seen \<union> {SEQ r1 r2})"
+
+
+
+
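+(* simp2 simplifies a regular expression bottom-up; the 'seen' set
+   collects subexpressions already encountered, so that a repeated
+   subexpression can be replaced by ZERO *)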
+fun
+ simp2 :: "rexp \<Rightarrow> rexp set \<Rightarrow> rexp * rexp set"
+where
+ "simp2 (ALT r1 r2) seen =
+ (let (r1s, seen1) = simp2 r1 seen in
+ let (r2s, seen2) = simp2 r2 seen1
+ in simp2_ALT r1s r2s seen2)"
+| "simp2 (SEQ r1 r2) seen =
+ (let (r1s, _) = simp2 r1 {} in
+ let (r2s, _) = simp2 r2 {}
+ in if (SEQ r1s r2s \<in> seen) then (ZERO, seen)
+ else simp2_SEQ r1s r2s seen)"
+| "simp2 r seen = (if r \<in> seen then (ZERO, seen) else (r, seen \<union> {r}))"
+
+
+lemma simp2_ALT[simp]:
+ shows "L (fst (simp2_ALT r1 r2 seen)) = L r1 \<union> L r2"
+ apply(induct r1 r2 seen rule: simp2_ALT.induct)
+ apply(simp_all)
+ done
+
+lemma test1:
+ shows "snd (simp2_ALT r1 r2 rs) = rs"
+ apply(induct r1 r2 rs rule: simp2_ALT.induct)
+ apply(auto)
+ done
+
+lemma test2:
+ shows "rs \<subseteq> snd (simp2_SEQ r1 r2 rs)"
+ apply(induct r1 r2 rs rule: simp2_SEQ.induct)
+ apply(auto)
+ done
+
+lemma test3:
+ shows "rs \<subseteq> snd (simp2 r rs)"
+ apply(induct r rs rule: simp2.induct)
+ apply(auto split: prod.split)
+ apply (metis set_mp test1)
+ by (meson set_mp test2)
+
+lemma test3a:
+ shows "\<Union>(L ` snd (simp2 r rs)) \<subseteq> L(r) \<union> (\<Union> (L ` rs))"
+ apply(induct r rs rule: simp2.induct)
+ apply(auto split: prod.split simp add: Sequ_def)
+ apply (smt UN_iff Un_iff set_mp test1)
+ oops
+
+lemma test:
+ assumes "(\<Union>r' \<in> rs. L r') \<subseteq> L r"
+ shows "L(fst (simp2 r rs)) \<subseteq> L(r)"
+ using assms
+ apply(induct r arbitrary: rs)
+ apply(simp only: simp2.simps)
+ apply(simp)
+ apply(simp only: simp2.simps)
+ apply(simp)
+ apply(simp only: simp2.simps)
+ apply(simp)
+ prefer 3
+ apply(simp only: simp2.simps)
+ apply(simp)
+ prefer 2
+ apply(simp only: simp2.simps)
+ apply(simp)
+ apply(subst prod.split)
+ apply(rule allI)+
+ apply(rule impI)
+ apply(subst prod.split)
+ apply(rule allI)+
+ apply(rule impI)
+ apply(simp)
+ apply(drule_tac x="rs" in meta_spec)
+ apply(simp)
+ apply(drule_tac x="x2" in meta_spec)
+ apply(simp)
+ apply(auto)[1]
+ apply(subgoal_tac "rs \<subseteq> x2a")
+ prefer 2
+ apply (metis order_trans prod.sel(2) test3)
+
+
+ apply(rule antisym)
+ prefer 2
+ apply(simp)
+ apply(rule conjI)
+ apply(drule meta_mp)
+ prefer 2
+ apply(simp)
+ apply(auto)[1]
+ apply(auto)[1]
+
+ thm prod.split
+ apply(auto split: prod.split)[1]
+ apply(drule_tac x="rs" in meta_spec)
+ apply(drule_tac x="rs" in meta_spec)
+ apply(simp)
+ apply(rule_tac x="SEQ x1 x1a" in bexI)
+ apply(simp add: Sequ_def)
+
+ apply(auto)[1]
+ apply(simp)
+ apply(subgoal_tac "L (fst (simp2_SEQ x1 x1a rs)) \<subseteq> L x1 \<union> L x1a")
+ apply(frule_tac x="{}" in meta_spec)
+ apply(rotate_tac 1)
+ apply(frule_tac x="{}" in meta_spec)
+ apply(simp)
+
+
+ apply(rule_tac x="SEQ x1 x1a" in bexI)
+ apply(simp add: Sequ_def)
+ apply(auto)[1]
+ apply(simp)
+ using L.simps(2) apply blast
+ prefer 3
+ apply(simp only: simp2.simps)
+ apply(simp)
+ using L.simps(3) apply blast
+ prefer 2
+ apply(simp add: Sequ_def)
+ apply(auto)[1]
+ oops
end
\ No newline at end of file