updated
authorChristian Urban <urbanc@in.tum.de>
Tue, 15 May 2018 10:24:25 +0100
changeset 283 c4d821c6309d
parent 282 bfab5aded21d
child 284 913205c1fd0d
updated
progs/scala/autos.scala
text/langs.sty
text/proposal.pdf
text/proposal.tex
text/style.sty
thys/Exercises.thy
thys/Simplifying.thy
--- a/progs/scala/autos.scala	Fri Jan 12 02:33:35 2018 +0000
+++ b/progs/scala/autos.scala	Tue May 15 10:24:25 2018 +0100
@@ -383,7 +383,7 @@
 
 */ 
 
-// Regular expressions fro derivative automata
+// Regular expressions for derivative automata
 
 abstract class Rexp
 case object ZERO extends Rexp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/text/langs.sty	Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,76 @@
+\usepackage{listings}
+\usepackage{etoolbox}
+\setmonofont[Scale=.95]{Consolas}
+\newfontfamily{\consolas}{Consolas}
+
+\definecolor{codered}{rgb}{0.6,0,0}            % for strings
+\definecolor{codegreen}{rgb}{0.25,0.5,0.35}    % comments
+\definecolor{codepurple}{rgb}{0.5,0,0.35}      % keywords
+\definecolor{codedocblue}{rgb}{0.25,0.35,0.75} % doc
+\definecolor{codeblue}{rgb}{0.25,0.35,0.75}    % types
+
+\BeforeBeginEnvironment{lstlisting}{\par\noindent\begin{minipage}{\linewidth}}
+\AfterEndEnvironment{lstlisting}{\end{minipage}\par}
+
+\lstdefinelanguage{Scala}{
+  morekeywords={abstract,case,catch,class,def,%
+    do,else,extends,false,final,finally,%
+    for,if,implicit,import,match,mixin,%
+    new,null,object,override,package,%
+    private,protected,requires,return,sealed,%
+    super,this,throw,trait,true,try,%
+    type,val,var,while,with,yield,write,read},%
+  otherkeywords={=>,<-,<\%,<:,>:,\#},%
+  sensitive=true,%
+  %directives={Int,Char,Rexp,String,Boolean,BigInt,Unit,List,Set},%
+  %moredelim=*[directive]:,%
+  morecomment=[l]{//},%
+  morecomment=[n]{/*}{*/},
+  morestring=[s]{"""}{"""},
+  morestring=[b]",
+  morestring=[b]',
+}[keywords,comments,strings]
+
+\lstdefinelanguage{While}{
+  morekeywords={if,then,else,while,do,true,false,write,upto,read,for,skip},
+  morecomment=[l]{//},
+  morecomment=[n]{/*}{*/},
+  morestring=[b]",
+  otherkeywords={=,!=,:=,<,>,\%;*,/},
+}[keywords,comments,strings]
+
+
+\newcommand{\code}[1]{{\lstinline{#1}}}
+\newcommand{\pcode}[1]{\mbox{\lstset{language={},keywordstyle=\color{black}}\lstinline!#1!}}
+\newcommand{\scode}[1]{\mbox{\lstset{language={},basicstyle=\ttfamily\color{codegreen}}\lstinline!#1!}}
+\makeatother
+
+%%\lstset{escapeinside={(*@}{@*)}}
+\lstset{escapeinside={/*@}{@*/}}
+
+%% stripy code
+\usepackage{lstlinebgrd}
+\definecolor{capri}{rgb}{0.0, 0.75, 1.0}
+
+
+\lstdefinestyle{mystyle}
+       {basicstyle=\ttfamily,
+	keywordstyle=\color{codepurple}\bfseries,
+	stringstyle=\color{codegreen},
+	commentstyle=\color{codegreen},
+	morecomment=[s][\color{codedocblue}]{/**}{*/},
+	numbers=none,
+	numberstyle=\tiny\color{black},
+	stepnumber=1,
+	numbersep=10pt,
+	tabsize=2,
+	showspaces=false,
+	showstringspaces=false,
+        xleftmargin=8mm,
+        emphstyle=\color{codeblue}\bfseries,
+        keepspaces,
+        linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}
+}
+
+\lstset{language=Scala,
+        style=mystyle}
Binary file text/proposal.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/text/proposal.tex	Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,429 @@
+\documentclass{article}
+\usepackage{url}
+\usepackage{color}\definecolor{darkblue}{rgb}{0,0,0.5}
+\usepackage[a4paper,colorlinks=true,linkcolor=darkblue,citecolor=darkblue,filecolor=darkblue,pagecolor=darkblue,urlcolor=darkblue]{hyperref}
+\usepackage{style}
+\usepackage{langs}
+\usepackage{tikz}
+\usepackage{pgf}
+\usepackage{pgfplots}
+\usepackage{stackengine}
+
+\addtolength{\oddsidemargin}{-6mm}
+\addtolength{\evensidemargin}{-6mm}
+\addtolength{\textwidth}{12mm}
+
+
+
+%% \usepackage{accents}
+\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
+
+\begin{filecontents}{re-python2.data}
+1 0.033
+5 0.036
+10 0.034
+15 0.036
+18 0.059
+19 0.084 
+20 0.141
+21 0.248
+22 0.485
+23 0.878
+24 1.71
+25 3.40
+26 7.08
+27 14.12
+28 26.69
+\end{filecontents}
+
+\begin{filecontents}{re-java.data}
+5  0.00298
+10  0.00418
+15  0.00996
+16  0.01710
+17  0.03492
+18  0.03303
+19  0.05084
+20  0.10177
+21  0.19960
+22  0.41159
+23  0.82234
+24  1.70251
+25  3.36112
+26  6.63998
+27  13.35120
+28  29.81185
+\end{filecontents}
+
+\begin{filecontents}{re-java9.data}
+1000  0.01410
+2000  0.04882
+3000  0.10609
+4000  0.17456
+5000  0.27530
+6000  0.41116
+7000  0.53741
+8000  0.70261
+9000  0.93981
+10000 0.97419
+11000 1.28697
+12000 1.51387
+14000 2.07079
+16000 2.69846
+20000 4.41823
+24000 6.46077
+26000 7.64373
+30000 9.99446
+34000 12.966885
+38000 16.281621
+42000 19.180228
+46000 21.984721
+50000 26.950203
+60000 43.0327746
+\end{filecontents}
+
+
+\begin{document}
+
+  
+\section*{Proposal:\\
+  Fast Regular Expression Matching\\
+  with Brzozowski Derivatives}
+
+\subsubsection*{Summary}
+
+If you want to connect a computer directly to the Internet, it must be
+immediately \mbox{hardened} against outside attacks. The current
+technology for this is to use regular expressions in order to
+automatically scan all incoming network traffic for signs when a
+computer is under attack and if found, to take appropriate
+counter-actions. One possible action could be to slow down the traffic
+from sites where repeated password-guesses originate. Well-known
+network intrusion prevention systems that use regular expressions for
+traffic analysis are Snort and Bro.
+
+Given the large
+volume of Internet traffic even the smallest servers can handle
+nowadays, the regular expressions for traffic analysis have become a
+real bottleneck.  This is not just a nuisance, but actually a security
+vulnerability in itself: it can lead to denial-of-service attacks.
+The proposed project aims to remove this bottleneck by using a little-known
+technique of building derivatives of regular expressions. These
+derivatives have not yet been used in the area of network traffic
+analysis, but have the potential to solve some tenacious problems with
+existing approaches. The project will require theoretical work, but
+also a practical implementation (a proof-of-concept at least).  The
+work will build on earlier work by Ausaf and Urban from King's College
+London \cite{AusafDyckhoffUrban2016}.
+
+
+\subsubsection*{Background}
+
+If every 10 minutes a thief turned up at your car and rattled
+violently at the doorhandles to see if he could get in, you would move
+your car to a safer neighbourhood. Computers, however, need to stay
+connected to the Internet all the time and there they have to withstand such
+attacks. A rather typical instance of an attack can be seen in the
+following log entries from a server at King's:
+
+{\small
+\begin{verbatim}
+ Jan 2 00:53:19 talisker sshd: Received disconnect from 110.53.183.227: [preauth]
+ Jan 2 00:58:31 talisker sshd: Received disconnect from 110.53.183.252: [preauth]
+ Jan 2 01:01:28 talisker sshd: Received disconnect from 221.194.47.236: [preauth]
+ Jan 2 01:03:59 talisker sshd: Received disconnect from 110.53.183.228: [preauth]
+ Jan 2 01:06:53 talisker sshd: Received disconnect from 221.194.47.245: [preauth]
+ ...
+\end{verbatim}}
+  
+
+\noindent
+This is a record of the network activities from the server
+\href{http://talisker.inf.kcl.ac.uk/cgi-bin/repos.cgi}{talisker.inf.kcl.ac.uk},
+which hosts lecture material for students. The log indicates several
+unsuccessful login attempts via the \texttt{ssh} program from
+computers with the addresses \texttt{110.53.183.227} and so on. This
+is a clear sign of a \emph{brute-force attack} where hackers
+systematically try out password candidates.  Such attacks are blunt,
+but unfortunately very powerful.  They can originate from anywhere in
+the World; they are automated and often conducted from computers that
+have been previously hijacked.  Once the attacker ``hits'' on the
+correct password, then he or she gets access to the server. The only
+way to prevent this methodical password-guessing is to spot the
+corresponding traces in the log and then slow down the traffic from
+such sites in a way that perhaps only one password per hour can be
+tried out. This does not affect ``normal'' operations of the server,
+but drastically decreases the chances of hitting the correct password
+by a brute-force attack.
+
+
+Server administrators use \emph{regular expressions} for scanning log
+files. The purpose of these expressions is to specify patterns of
+interest (for example \texttt{Received disconnect from 110.53.183.227}
+where the address needs to be variable). A program will
+then continuously scan for such patterns and trigger an action if a
+pattern is found. Popular examples of such programs are Snort and Bro
+\cite{snort,bro}.  Clearly, there are also other kinds of
+vulnerabilities hackers try to take advantage of---mainly programming
+mistakes that can be abused to gain access to a computer. This means
+server administrators have a suite of sometimes thousands of regular
+expressions, all prescribing some suspicious pattern for when a
+computer is under attack.
+
+
+The underlying algorithmic problem is to decide whether a string in
+the logs matches the patterns determined by the regular
+expressions. Such decisions need to be done as fast as possible,
+otherwise the scanning programs would not be useful as a hardening
+technique for servers. The quest for speed with these decisions is
+presently a rather big challenge for the amount of traffic typical
+serves have to cope with and the large number of patterns that might
+be of interest.  The problem is that when thousands of regular
+expressions are involved, the process of detecting whether a string
+matches a regular expression can be excruciating slow. This might not
+happen in most instances, but in some small number of
+instances. However in these small number of instances the algorithm
+for regular expression matching can grind to a halt.  This phenomenon
+is called \emph{catastrophic backtracking} \cite{catast}.
+Unfortunately, catastrophic backtracking is not just a nuisance, but a
+security vulnerability in itself. The reason is that attackers look
+for these instances where the scan is very slow and then trigger a
+\emph{denial-of-service attack} against the server\ldots meaning the
+server will not be reachable anymore to normal users.
+
+Existing approaches try to mitigate the speed problem with regular
+expressions by implementing the matching algorithms in hardware,
+rather than in software \cite{hardware}. Although this solution offers
+speed, it is often unappealing because attack patterns can change or
+need to be augmented. A software solution is clearly more desirable in
+these circumstances.  Also given that regular expressions were
+introduced in 1950 by Kleene, one might think regular expressions have
+since been studied and implemented to death. But this would definitely
+be a mistake: in fact they are still an active research area and
+off-the-shelf implementations are still extremely prone to the
+problem of catastrophic backtracking. This can be seen in the
+following two graphs:
+
+\begin{center}
+\begin{tabular}{@{}cc@{}}
+%\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
+%           $\underbrace{a\ldots a}_{n}$}\smallskip\\  
+\begin{tikzpicture}
+\begin{axis}[
+    xlabel={$n$},
+    x label style={at={(1.05,0.0)}},
+    ylabel={time in secs},
+    y label style={at={(0.06,0.5)}},
+    enlargelimits=false,
+    xtick={0,5,...,30},
+    xmax=33,
+    ymax=45,
+    ytick={0,10,...,40},
+    scaled ticks=false,
+    axis lines=left,
+    width=6cm,
+    height=4.5cm, 
+    legend entries={Python, Java 8},  
+    legend pos=north west]
+\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
+\end{axis}
+\end{tikzpicture}
+  & 
+\begin{tikzpicture}
+\begin{axis}[
+    xlabel={$n$},
+    x label style={at={(1.05,0.0)}},
+    ylabel={time in secs},
+    y label style={at={(0.06,0.5)}},
+    %enlargelimits=false,
+    %xtick={0,5000,...,30000},
+    xmax=65000,
+    ymax=45,
+    ytick={0,10,...,40},
+    scaled ticks=false,
+    axis lines=left,
+    width=6cm,
+    height=4.5cm, 
+    legend entries={Java 9},  
+    legend pos=north west]
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
+\end{axis}
+\end{tikzpicture}
+\end{tabular}  
+\end{center}
+
+\noindent
+These graphs show how long strings can be when using the rather simple
+regular expression $(a^*)^* \,b$ and the built-in regular expression
+matchers in the popular programming languages Python and Java (Version
+8 and Version 9). There are many more regular expressions that show
+the same bahaviour.  On the left-hand side the graphs show that for a
+string consisting of just 28 $a\,$'s, Python and the older Java (which
+was the latest version of Java until September 2017) already need 30
+seconds to decide whether this string is matched or not. This is an
+abysmal result given that Python and Java are very popular programming
+languages. We already know that this problem can be decided much, much
+faster.  If these slow regular expression matchers would be employed
+in network traffic analysis, then this example is already an
+opportunity for mounting an easy denial-of-service attack: one just
+has to send to the server a large enough string of a's. The newer
+version of Java is better---it can match strings of around 50,000
+$a\,$'s in 30 seconds---however this would still not be good enough
+for being a useful tool for network traffic analysis. What is
+interesting is that even a relatively simple-minded regular expression
+matcher based on Brzozowski derivatives can out-compete the regular
+expressions matchers in Java and Python on such catastrophic
+backtracking examples. This gain in speed is the motivation and
+starting point for the proposed project. \smallskip
+
+\subsection*{Research Plan}
+
+Regular expressions are already an old and well-studied topic in
+Computer Science. They were originally introduced by Kleene in 1951
+and are utilised in text search algorithms for ``find'' or ``find and
+replace'' operations \cite{Brian,Kleene51}. Because of their wide
+range of applications, many programming languages provide capabilities
+for regular expression matching via libraries.  The classic theory
+of regular expressions involves just 6 different kinds of regular
+expressions, often defined in terms of the following grammar:
+
+\begin{center}\small
+\begin{tabular}{lcll}
+  $r$ & $::=$ & $\ZERO$     & cannot match anything\\
+      &   $|$ & $\ONE$      & can only match the empty string\\
+      &   $|$ & $c$         & can match a single character (in this case $c$)\\
+      &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
+  &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
+          &  & & then the second part with $r_2$\\
+      &   $|$ & $r^*$       & can match zero or more times $r$\\
+\end{tabular}
+\end{center}
+
+\noindent
+For practical purposes, however, regular expressions have been
+extended in various ways in order to make them even more powerful and
+even more useful for applications. Some of the problems to do with
+catastrophic backtracking stem, unfortunately, from these extensions.
+
+The crux in this project is to not use automata for regular expression
+matching (the traditional technique), but to use derivatives of
+regular expressions instead.  These derivatives were introduced by
+Brzozowski in 1964 \cite{Brzozowski1964}, but somehow had been lost
+``in the sands of time'' \cite{Owens2009}. However, they recently have
+become again a ``hot'' research topic with numerous research papers
+appearing in the last five years. One reason for this interest is that
+Brzozowski derivatives straightforwardly extend to more general
+regular expression constructors, which cannot be easily treated with
+standard techniques. They can also be implemented with ease in any
+functional programming language: the definition for derivatives
+consists of just two simple recursive functions shown in
+Figure~\ref{f}. Moreover, it can be easily shown (by a mathematical
+proof) that the resulting regular matcher is in fact always producing
+a correct answer. This is an area where Urban can provide a lot of
+expertise for this project: he is one of the main developers of the
+Isabelle theorem prover, which is a program designed for such proofs
+\cite{Isabelle}.
+
+
+\begin{figure}[t]
+\begin{center}\small
+\begin{tabular}{lcl}
+$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
+$\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
+$\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
+$\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
+$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
+$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\medskip\\
+
+$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
+$\textit{der}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
+$\textit{der}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
+$\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
+$\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
+      & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
+      & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
+$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\[-5mm]
+\end{tabular}
+\end{center}
+\caption{The complete definition of derivatives of regular expressions
+  \cite{Brzozowski1964}.
+  This definition can be implemented with ease in functional programming languages and can be easily extended to regular expressions used in practice.
+  The are more flexible for applications and easier to manipulate in
+  mathematical proofs, than traditional techniques for regular expression
+  matching.\medskip\label{f}}
+\end{figure}
+
+There are a number of avenues for research on Brzozowski derivatives.
+One problem I like to immediately tackle in this project is how to
+handle \emph{back-references} in regular expressions. It is known that
+the inclusion of back-references causes the underlying algorithmic
+problem to become NP-hard \cite{Aho}, and is the main reason why
+regular expression matchers suffer from the catastrophic backtracking
+problem. However, the full generality of back-references are not
+needed in practical applications. The goal therefore is to sufficiently
+restrict the problem so that an efficient algorithm can be
+designed. There seem to be good indications that Brzozowski
+derivatives are useful for that.
+
+Another problem is \emph{how} regular expressions match a string
+\cite{Sulzmann2014}. In this case the algorithm does not just give a
+yes/no answer about whether the regular expression matches the string,
+but also generates a value that encodes which part of the string is
+matched by which part of the regular expression. This is important in
+applications, like network analysis where from a matching log entry a
+specific computer address needs to be extracted.  Also compilers make
+extensive use of such an extension when they lex their source
+programs, i.e.~break up code into word-like components.  Again Ausaf
+and Urban from King's made some initial progress about proving the
+correctness of such an extension, but their algorithm is not yet fast
+enough for practical purposes \cite{AusafDyckhoffUrban2016}. It would
+be desirable to study optimisations that make the algorithm faster,
+but preserve the correctness guaranties obtained by Ausaf and Urban.
+
+\mbox{}\\[-11mm]
+
+\subsubsection*{Conclusion}
+
+Much research has already gone into regular expressions, and one might
+think they have been studied and implemented to death. But this is far
+from the truth. In fact regular expressions have become a ``hot''
+research topic in recent years because of problems with the
+traditional methods (in applications such as network traffic
+analysis), but also because the technique of derivatives of regular
+expression has been re-discovered. These derivatives provide an
+alternative approach to regular expression matching. Their potential
+lies in the fact that they are more flexible in implementations and
+much easier to manipulate in mathematical proofs. Therefore I like to
+research them in this project.\medskip
+
+\noindent {\bf Impact:} Regular expression matching is a core
+technology in Computer Science with numerous applications. I will
+focus on the area of network traffic analysis and also lexical
+analysis. In these areas this project can have a quite large impact:
+for example the program Bro has been downloaded at least 10,000 times and Snort
+even has 5 million downloads and over 600,000 registered users
+\cite{snort,bro}. Lexical analysis is a core component in every
+compiler, which are the bedrock on which nearly all programming is
+built on.
+
+% {\small
+% \begin{itemize}
+% \item[$\bullet$] \url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
+% \item[$\bullet$] \url{https://vimeo.com/112065252}
+% \item[$\bullet$] \url{http://davidvgalbraith.com/how-i-fixed-atom/}  
+% \end{itemize}}
+\mbox{}\\[-11mm]
+
+\small
+\bibliographystyle{plain}
+\renewcommand{\refname}{References\\[-7mm]\mbox{}}
+\bibliography{proposal}
+
+\end{document}
+
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: t
+%%% End: 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/text/style.sty	Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,79 @@
+\usepackage{xcolor}
+%%\usepackage{fontspec}
+\usepackage[sc]{mathpazo}
+\usepackage{fontspec}
+\setmainfont[Ligatures=TeX]{Palatino Linotype}
+\usepackage{amssymb}
+\usepackage{amsmath}
+\usepackage{menukeys}
+\definecolor{darkblue}{rgb}{0,0,0.6}
+\usepackage[colorlinks=true,urlcolor=darkblue,linkcolor=darkblue]{hyperref}
+
+%%% for regular expressions and values
+\newcommand{\ZERO}{\mbox{\bf 0}}
+\newcommand{\ONE}{\mbox{\bf 1}}
+\newcommand{\Left}{\textit{Left}}
+\newcommand{\Der}{\textit{Der}}
+\newcommand{\der}{\textit{der}}
+\newcommand{\Ders}{\textit{Ders}}
+\newcommand{\ders}{\textit{ders}}
+
+%%% for trees
+%% http://anorien.csc.warwick.ac.uk/mirrors/CTAN/graphics/pgf/contrib/forest/forest.pdf
+
+\newcommand\grid[1]{%
+\begin{tikzpicture}[baseline=(char.base)]
+  \path[use as bounding box]
+    (0,0) rectangle (1em,1em);
+  \draw[red!50, fill=red!20]
+    (0,0) rectangle (1em,1em);
+  \node[inner sep=1pt,anchor=base west]
+    (char) at (0em,\gridraiseamount) {#1};
+\end{tikzpicture}}
+\newcommand\gridraiseamount{0.12em}
+
+\makeatletter
+\newcommand\Grid[1]{%
+  \@tfor\z:=#1\do{\grid{\z}}}
+\makeatother	
+
+\newcommand\Vspace[1][.3em]{%
+  \mbox{\kern.06em\vrule height.3ex}%
+  \vbox{\hrule width#1}%
+  \hbox{\vrule height.3ex}}
+
+\def\VS{\Vspace[0.6em]}
+
+
+\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}
+\newcommand{\defn}[1]{\textit{\textbf{#1}}}
+\newcommand{\dq}[1]{\mbox{\tt{"}}#1\mbox{\tt{"}}}
+
+\definecolor{codegray}{gray}{0.9}
+
+\makeatletter
+\def\fnote{\gdef\@thefnmark{}\@footnotetext}
+\makeatother
+
+\newcommand{\HEADER}{{\bf Please submit your solutions via email. Please submit 
+only ASCII text or PDFs. Every solution should be preceeded by the corresponding 
+question text, like:
+
+\begin{center}
+\begin{tabular}{ll}
+Q$n$: & \ldots a difficult question from me\ldots\\
+A:    & \ldots an answer from you \ldots\\
+Q$n+1$ & \ldots another difficult question\ldots\\
+A:     & \ldots another brilliant answer from you\ldots
+\end{tabular}
+\end{center}
+
+\noindent Solutions will only be accepted until 20th December! Please send only
+one homework per email.}\bigskip}
+
+\newcommand{\POSTSCRIPT}{
+{\bf (Optional)} This question is for you to provide 
+regular feedback to me: for example
+what were the most interesting, least interesting, or confusing 
+parts in this lecture? Any problems with my Scala code? Please 
+feel free to share any other questions or concerns.} 
--- a/thys/Exercises.thy	Fri Jan 12 02:33:35 2018 +0000
+++ b/thys/Exercises.thy	Tue May 15 10:24:25 2018 +0100
@@ -30,6 +30,8 @@
      zeroable r1 \<or> zeroable r2 \<or> (atmostempty r1 \<and> atmostempty r2)"
 | "atmostempty (STAR r) = atmostempty r"
 
+
+
 fun
  somechars :: "rexp \<Rightarrow> bool"
 where
@@ -50,8 +52,7 @@
 apply blast
 apply(auto)
 using Star_cstring
-by (metis concat_eq_Nil_conv) 
-
+  by (metis concat_eq_Nil_conv) 
 
 lemma atmostempty_correctness_aux:
   shows "atmostempty r \<longleftrightarrow> \<not> somechars r"
@@ -65,6 +66,33 @@
 by(auto simp add: atmostempty_correctness_aux somechars_correctness)
 
 fun
+ leastsinglechar :: "rexp \<Rightarrow> bool"
+where
+  "leastsinglechar (ZERO) \<longleftrightarrow> False"
+| "leastsinglechar (ONE) \<longleftrightarrow> False"
+| "leastsinglechar (CHAR c) \<longleftrightarrow> True"
+| "leastsinglechar (ALT r1 r2) \<longleftrightarrow> leastsinglechar r1 \<or> leastsinglechar r2"
+| "leastsinglechar (SEQ r1 r2) \<longleftrightarrow> 
+      (if (zeroable r1 \<or> zeroable r2) then False
+       else ((nullable r1 \<and> leastsinglechar r2) \<or> (nullable r2 \<and> leastsinglechar r1)))"
+| "leastsinglechar (STAR r) \<longleftrightarrow> leastsinglechar r"
+
+lemma leastsinglechar_correctness:
+  "leastsinglechar r \<longleftrightarrow> (\<exists>c. [c] \<in> L r)"
+  apply(induct r)
+  apply(simp)
+  apply(simp)
+  apply(simp)
+  prefer 2
+  apply(simp)
+  apply(blast)
+  prefer 2
+  apply(simp)
+  using Star.step Star_decomp apply fastforce
+  apply(simp add: Sequ_def zeroable_correctness nullable_correctness)
+  by (metis append_Nil append_is_Nil_conv butlast_append butlast_snoc)
+
+fun
  infinitestrings :: "rexp \<Rightarrow> bool"
 where
   "infinitestrings (ZERO) = False"
--- a/thys/Simplifying.thy	Fri Jan 12 02:33:35 2018 +0000
+++ b/thys/Simplifying.thy	Tue May 15 10:24:25 2018 +0100
@@ -32,6 +32,7 @@
 | "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, f\<^sub>2) = (r\<^sub>1, F_LEFT f\<^sub>1)"
 | "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2) = (ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"
 
+(* where is ZERO *)
 fun simp_SEQ where
   "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2) = (r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"
 | "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2) = (r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"
@@ -227,6 +228,144 @@
        ultimately show "slexer r (c # s) = lexer r (c # s)" 
          by (simp del: slexer.simps add: slexer_better_simp)
    qed
-qed  
+ qed  
+
+
+fun simp2_ALT where
+  "simp2_ALT ZERO r2 seen = (r2, seen)"
+| "simp2_ALT r1 ZERO seen = (r1, seen)"
+| "simp2_ALT r1 r2 seen = (ALT r1 r2, seen)"
+
+fun simp2_SEQ where
+  "simp2_SEQ ZERO r2 seen = (ZERO, seen)"
+| "simp2_SEQ r1 ZERO seen = (ZERO, seen)"
+| "simp2_SEQ ONE r2 seen = (r2, seen \<union> {r2})"
+| "simp2_SEQ r1 ONE seen = (r1, seen \<union> {r1})"
+| "simp2_SEQ r1 r2 seen = (SEQ r1 r2, seen \<union> {SEQ r1 r2})"  
+
+
+
+
+fun 
+  simp2 :: "rexp \<Rightarrow> rexp set \<Rightarrow> rexp * rexp set"
+where
+  "simp2 (ALT r1 r2) seen = 
+      (let (r1s, seen1) = simp2 r1 seen in 
+       let (r2s, seen2) = simp2 r2 seen1 
+       in simp2_ALT r1s r2s seen2)" 
+| "simp2 (SEQ r1 r2) seen = 
+      (let (r1s, _) = simp2 r1 {} in 
+       let (r2s, _) = simp2 r2 {} 
+       in if (SEQ r1s r2s \<in> seen) then (ZERO, seen)
+       else simp2_SEQ r1s r2s seen)"
+| "simp2 r seen = (if r \<in> seen then (ZERO, seen) else (r, seen \<union> {r}))"
+
+
+lemma simp2_ALT[simp]: 
+  shows "L (fst (simp2_ALT r1 r2 seen)) = L r1 \<union> L r2"
+  apply(induct r1 r2 seen rule: simp2_ALT.induct)
+  apply(simp_all)
+  done
+
+lemma test1:
+  shows "snd (simp2_ALT r1 r2 rs) = rs"
+  apply(induct r1 r2 rs rule: simp2_ALT.induct)
+  apply(auto)
+  done
+
+lemma test2:
+  shows "rs \<subseteq> snd (simp2_SEQ r1 r2 rs)"
+  apply(induct r1 r2 rs rule: simp2_SEQ.induct)
+  apply(auto)
+  done
+
+lemma test3:
+  shows "rs \<subseteq> snd (simp2 r rs)"
+  apply(induct r rs rule: simp2.induct)
+  apply(auto split: prod.split)
+  apply (metis set_mp test1)
+  by (meson set_mp test2)
+  
+lemma test3a:
+  shows "\<Union>(L ` snd (simp2 r rs)) \<subseteq> L(r) \<union> (\<Union> (L ` rs))"
+  apply(induct r rs rule: simp2.induct)
+  apply(auto split: prod.split simp add: Sequ_def)
+  apply (smt UN_iff Un_iff set_mp test1)
+    
+
+lemma test:
+  assumes "(\<Union>r' \<in> rs. L r') \<subseteq> L r"
+  shows "L(fst (simp2 r rs)) \<subseteq> L(r)"
+  using assms
+  apply(induct r arbitrary: rs)
+  apply(simp only: simp2.simps)
+  apply(simp)
+  apply(simp only: simp2.simps)
+  apply(simp)
+  apply(simp only: simp2.simps)
+  apply(simp)
+  prefer 3
+  apply(simp only: simp2.simps)
+  apply(simp)
+  prefer 2
+  apply(simp only: simp2.simps)
+   apply(simp)
+   apply(subst prod.split)
+  apply(rule allI)+
+   apply(rule impI)
+  apply(subst prod.split)
+  apply(rule allI)+
+   apply(rule impI)
+   apply(simp)
+   apply(drule_tac x="rs" in meta_spec)
+  apply(simp)
+  apply(drule_tac x="x2" in meta_spec)
+   apply(simp)
+   apply(auto)[1]
+    apply(subgoal_tac "rs \<subseteq> x2a")
+     prefer 2
+  apply (metis order_trans prod.sel(2) test3)
+  
+  
+   apply(rule antisym)
+  prefer 2
+    apply(simp)
+    apply(rule conjI)
+     apply(drule meta_mp)
+  prefer 2
+      apply(simp)
+      apply(auto)[1]
+    apply(auto)[1]
+  
+  thm prod.split
+   apply(auto split: prod.split)[1]
+     apply(drule_tac x="rs" in meta_spec)
+     apply(drule_tac x="rs" in meta_spec)
+     apply(simp)
+     apply(rule_tac x="SEQ x1 x1a" in bexI)
+      apply(simp add: Sequ_def)
+
+      apply(auto)[1]
+     apply(simp)
+  apply(subgoal_tac "L (fst (simp2_SEQ x1 x1a rs)) \<subseteq> L x1 \<union> L x1a")
+     apply(frule_tac x="{}" in meta_spec)
+  apply(rotate_tac 1)
+     apply(frule_tac x="{}" in meta_spec)
+     apply(simp)
+   
+    
+     apply(rule_tac x="SEQ x1 x1a" in bexI)
+      apply(simp add: Sequ_def)
+      apply(auto)[1]
+  apply(simp)
+  using L.simps(2) apply blast
+  prefer 3
+  apply(simp only: simp2.simps)
+    apply(simp)
+  using L.simps(3) apply blast
+  prefer 2
+   apply(simp add: Sequ_def)
+   apply(auto)[1]
+  oops  
 
 end
\ No newline at end of file