# HG changeset patch # User Christian Urban # Date 1526376265 -3600 # Node ID c4d821c6309d9006c34f6766245483076297bc2d # Parent bfab5aded21d7abcaec69c5c9e71ad0af55a10dc updated diff -r bfab5aded21d -r c4d821c6309d progs/scala/autos.scala --- a/progs/scala/autos.scala Fri Jan 12 02:33:35 2018 +0000 +++ b/progs/scala/autos.scala Tue May 15 10:24:25 2018 +0100 @@ -383,7 +383,7 @@ */ -// Regular expressions fro derivative automata +// Regular expressions for derivative automata abstract class Rexp case object ZERO extends Rexp diff -r bfab5aded21d -r c4d821c6309d text/langs.sty --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/text/langs.sty Tue May 15 10:24:25 2018 +0100 @@ -0,0 +1,76 @@ +\usepackage{listings} +\usepackage{etoolbox} +\setmonofont[Scale=.95]{Consolas} +\newfontfamily{\consolas}{Consolas} + +\definecolor{codered}{rgb}{0.6,0,0} % for strings +\definecolor{codegreen}{rgb}{0.25,0.5,0.35} % comments +\definecolor{codepurple}{rgb}{0.5,0,0.35} % keywords +\definecolor{codedocblue}{rgb}{0.25,0.35,0.75} % doc +\definecolor{codeblue}{rgb}{0.25,0.35,0.75} % types + +\BeforeBeginEnvironment{lstlisting}{\par\noindent\begin{minipage}{\linewidth}} +\AfterEndEnvironment{lstlisting}{\end{minipage}\par} + +\lstdefinelanguage{Scala}{ + morekeywords={abstract,case,catch,class,def,% + do,else,extends,false,final,finally,% + for,if,implicit,import,match,mixin,% + new,null,object,override,package,% + private,protected,requires,return,sealed,% + super,this,throw,trait,true,try,% + type,val,var,while,with,yield,write,read},% + otherkeywords={=>,<-,<\%,<:,>:,\#},% + sensitive=true,% + %directives={Int,Char,Rexp,String,Boolean,BigInt,Unit,List,Set},% + %moredelim=*[directive]:,% + morecomment=[l]{//},% + morecomment=[n]{/*}{*/}, + morestring=[s]{"""}{"""}, + morestring=[b]", + morestring=[b]', +}[keywords,comments,strings] + +\lstdefinelanguage{While}{ + morekeywords={if,then,else,while,do,true,false,write,upto,read,for,skip}, + morecomment=[l]{//}, + morecomment=[n]{/*}{*/}, + morestring=[b]", + otherkeywords={=,!=,:=,<,>,\%;*,/}, +}[keywords,comments,strings] + + +\newcommand{\code}[1]{{\lstinline{#1}}} +\newcommand{\pcode}[1]{\mbox{\lstset{language={},keywordstyle=\color{black}}\lstinline!#1!}} +\newcommand{\scode}[1]{\mbox{\lstset{language={},basicstyle=\ttfamily\color{codegreen}}\lstinline!#1!}} +\makeatother + +%%\lstset{escapeinside={(*@}{@*)}} +\lstset{escapeinside={/*@}{@*/}} + +%% stripy code +\usepackage{lstlinebgrd} +\definecolor{capri}{rgb}{0.0, 0.75, 1.0} + + +\lstdefinestyle{mystyle} + {basicstyle=\ttfamily, + keywordstyle=\color{codepurple}\bfseries, + stringstyle=\color{codegreen}, + commentstyle=\color{codegreen}, + morecomment=[s][\color{codedocblue}]{/**}{*/}, + numbers=none, + numberstyle=\tiny\color{black}, + stepnumber=1, + numbersep=10pt, + tabsize=2, + showspaces=false, + showstringspaces=false, + xleftmargin=8mm, + emphstyle=\color{codeblue}\bfseries, + keepspaces, + linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi} +} + +\lstset{language=Scala, + style=mystyle} diff -r bfab5aded21d -r c4d821c6309d text/proposal.pdf Binary file text/proposal.pdf has changed diff -r bfab5aded21d -r c4d821c6309d text/proposal.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/text/proposal.tex Tue May 15 10:24:25 2018 +0100 @@ -0,0 +1,429 @@ +\documentclass{article} +\usepackage{url} +\usepackage{color}\definecolor{darkblue}{rgb}{0,0,0.5} +\usepackage[a4paper,colorlinks=true,linkcolor=darkblue,citecolor=darkblue,filecolor=darkblue,pagecolor=darkblue,urlcolor=darkblue]{hyperref} 
+\usepackage{style}
+\usepackage{langs}
+\usepackage{tikz}
+\usepackage{pgf}
+\usepackage{pgfplots}
+\usepackage{stackengine}
+
+\addtolength{\oddsidemargin}{-6mm}
+\addtolength{\evensidemargin}{-6mm}
+\addtolength{\textwidth}{12mm}
+
+
+
+%% \usepackage{accents}
+\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
+
\begin{filecontents}{re-python2.data}
1 0.033
5 0.036
10 0.034
15 0.036
18 0.059
19 0.084
20 0.141
21 0.248
22 0.485
23 0.878
24 1.71
25 3.40
26 7.08
27 14.12
28 26.69
\end{filecontents}

\begin{filecontents}{re-java.data}
5 0.00298
10 0.00418
15 0.00996
16 0.01710
17 0.03492
18 0.03303
19 0.05084
20 0.10177
21 0.19960
22 0.41159
23 0.82234
24 1.70251
25 3.36112
26 6.63998
27 13.35120
28 29.81185
\end{filecontents}

\begin{filecontents}{re-java9.data}
1000 0.01410
2000 0.04882
3000 0.10609
4000 0.17456
5000 0.27530
6000 0.41116
7000 0.53741
8000 0.70261
9000 0.93981
10000 0.97419
11000 1.28697
12000 1.51387
14000 2.07079
16000 2.69846
20000 4.41823
24000 6.46077
26000 7.64373
30000 9.99446
34000 12.966885
38000 16.281621
42000 19.180228
46000 21.984721
50000 26.950203
60000 43.0327746
\end{filecontents}


\begin{document}


\section*{Proposal:\\
  Fast Regular Expression Matching\\
  with Brzozowski Derivatives}

\subsubsection*{Summary}

If you want to connect a computer directly to the Internet, it must be
immediately \mbox{hardened} against outside attacks. The current
technology for this is to use regular expressions to
automatically scan all incoming network traffic for signs that a
computer is under attack and, if such signs are found, to take
appropriate counter-actions. One possible action could be to slow down
the traffic from sites where repeated password-guesses originate.
Well-known network intrusion prevention systems that use regular
expressions for traffic analysis are Snort and Bro.

Given the large volume of Internet traffic that even the smallest
servers have to handle nowadays, the regular expressions for traffic
analysis have become a real bottleneck. This is not just a nuisance,
but actually a security vulnerability in itself: it can lead to
denial-of-service attacks. The proposed project aims to remove this
bottleneck by using a little-known technique of building derivatives
of regular expressions. These derivatives have not yet been used in
the area of network traffic analysis, but have the potential to solve
some tenacious problems with existing approaches. The project will
require theoretical work, but also a practical implementation (a
proof-of-concept at least). The work will build on earlier results by
Ausaf and Urban from King's College London
\cite{AusafDyckhoffUrban2016}.


\subsubsection*{Background}

If every 10 minutes a thief turned up at your car and rattled
violently at the door handles to see if he could get in, you would move
your car to a safer neighbourhood. Computers, however, need to stay
connected to the Internet all the time and there they have to withstand
such attacks.
A rather typical instance of an attack can be seen in the
following log entries from a server at King's:

{\small
\begin{verbatim}
 Jan 2 00:53:19 talisker sshd: Received disconnect from 110.53.183.227: [preauth]
 Jan 2 00:58:31 talisker sshd: Received disconnect from 110.53.183.252: [preauth]
 Jan 2 01:01:28 talisker sshd: Received disconnect from 221.194.47.236: [preauth]
 Jan 2 01:03:59 talisker sshd: Received disconnect from 110.53.183.228: [preauth]
 Jan 2 01:06:53 talisker sshd: Received disconnect from 221.194.47.245: [preauth]
 ...
\end{verbatim}}


\noindent
This is a record of the network activities from the server
\href{http://talisker.inf.kcl.ac.uk/cgi-bin/repos.cgi}{talisker.inf.kcl.ac.uk},
which hosts lecture material for students. The log indicates several
unsuccessful login attempts via the \texttt{ssh} program from
computers with the addresses \texttt{110.53.183.227} and so on. This
is a clear sign of a \emph{brute-force attack} where hackers
systematically try out password candidates. Such attacks are blunt,
but unfortunately very powerful. They can originate from anywhere in
the world; they are automated and often conducted from computers that
have been previously hijacked. Once the attacker ``hits'' on the
correct password, he or she gains access to the server. The only
way to prevent this methodical password-guessing is to spot the
corresponding traces in the log and then slow down the traffic from
such sites so that perhaps only one password per hour can be
tried out. This does not affect ``normal'' operations of the server,
but drastically decreases the chances of a brute-force attack hitting
the correct password.


Server administrators use \emph{regular expressions} for scanning log
files. The purpose of these expressions is to specify patterns of
interest (for example \texttt{Received disconnect from 110.53.183.227},
where the address needs to be variable). A program will
then continuously scan for such patterns and trigger an action if a
pattern is found. Popular examples of such programs are Snort and Bro
\cite{snort,bro}. Clearly, there are also other kinds of
vulnerabilities hackers try to take advantage of---mainly programming
mistakes that can be abused to gain access to a computer. This means
server administrators maintain suites of sometimes thousands of regular
expressions, each describing a suspicious pattern that indicates a
computer is under attack.
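To make this concrete, here is a small illustrative sketch in Scala of
how such a pattern could look and how it could be applied to the log
entries above (the pattern is hypothetical and not taken from the
Snort or Bro rule sets; the capture group is what keeps the address
variable):

\begin{lstlisting}[language=Scala]
// an illustrative pattern in the spirit of such rules;
// the group ([0-9.]+) captures the variable address
val disconnect = """Received disconnect from ([0-9.]+): \[preauth\]""".r

val line =
  "Jan 2 00:53:19 talisker sshd: Received disconnect from 110.53.183.227: [preauth]"

disconnect.findFirstMatchIn(line) match {
  case Some(m) => println("suspicious address: " + m.group(1))
  case None    => println("no match")
}
\end{lstlisting}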
The underlying algorithmic problem is to decide whether a string in
the logs matches the patterns determined by the regular
expressions. Such decisions need to be made as fast as possible,
otherwise the scanning programs would not be useful as a hardening
technique for servers. The quest for speed with these decisions is
presently a rather big challenge, given the amount of traffic typical
servers have to cope with and the large number of patterns that might
be of interest. The problem is that when thousands of regular
expressions are involved, the process of detecting whether a string
matches a regular expression can be excruciatingly slow. This does not
happen in most instances, but in a small number of cases the matching
algorithm can grind to a halt. This phenomenon is called
\emph{catastrophic backtracking} \cite{catast}. Unfortunately,
catastrophic backtracking is not just a nuisance, but a security
vulnerability in itself. The reason is that attackers look
for these instances where the scan is very slow and then trigger a
\emph{denial-of-service attack} against the server\ldots meaning the
server will no longer be reachable for normal users.

Existing approaches try to mitigate the speed problem with regular
expressions by implementing the matching algorithms in hardware,
rather than in software \cite{hardware}. Although this solution offers
speed, it is often unappealing because attack patterns can change or
need to be augmented. A software solution is clearly more desirable in
these circumstances. Also, given that regular expressions were
introduced in 1951 by Kleene, one might think they have since been
studied and implemented to death. But this would definitely
be a mistake: in fact they are still an active research area and
off-the-shelf implementations are still extremely prone to the
problem of catastrophic backtracking. This can be seen in the
following two graphs:

\begin{center}
\begin{tabular}{@{}cc@{}}
%\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings
%           $\underbrace{a\ldots a}_{n}$}\smallskip\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,0.0)}},
    ylabel={time in secs},
    y label style={at={(0.06,0.5)}},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=45,
    ytick={0,10,...,40},
    scaled ticks=false,
    axis lines=left,
    width=6cm,
    height=4.5cm,
    legend entries={Python, Java 8},
    legend pos=north west]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,0.0)}},
    ylabel={time in secs},
    y label style={at={(0.06,0.5)}},
    %enlargelimits=false,
    %xtick={0,5000,...,30000},
    xmax=65000,
    ymax=45,
    ytick={0,10,...,40},
    scaled ticks=false,
    axis lines=left,
    width=6cm,
    height=4.5cm,
    legend entries={Java 9},
    legend pos=north west]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}
\end{tabular}
\end{center}

\noindent
These graphs show the running times for matching strings consisting of
$n$ $a$'s against the rather simple regular expression $(a^*)^* \,b$,
using the built-in regular expression matchers of the popular
programming languages Python and Java (Version 8 and Version 9). There
are many more regular expressions that show the same behaviour. The
left-hand graph shows that for a string consisting of just 28 $a\,$'s,
Python and the older Java (which was the latest version of Java until
September 2017) already need 30 seconds to decide whether this string
is matched or not. This is an abysmal result given that Python and
Java are very popular programming languages. We already know that this
problem can be decided much, much faster. If these slow regular
expression matchers were employed in network traffic analysis, then
this example is already an opportunity for mounting an easy
denial-of-service attack: one just has to send the server a long
enough string of $a$'s. The newer version of Java is better---it can
match strings of around 50,000 $a\,$'s in 30 seconds---but this is
still not good enough to be a useful tool for network traffic
analysis.
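For reference, the measurements in the left-hand graph can be
reproduced with just a few lines of Scala calling Java's built-in
matcher (a sketch only; the exact harness used to produce the graphs
may differ):

\begin{lstlisting}[language=Scala]
import java.util.regex.Pattern

// the "evil" regular expression (a*)*b from the graphs above
val evil = Pattern.compile("(a*)*b")

for (n <- 5 to 28) {
  val s = "a" * n                  // the string aa...a with n a's
  val start = System.nanoTime()
  evil.matcher(s).matches()        // cannot match, backtracks excessively
  val secs = (System.nanoTime() - start) / 1e9
  println(f"$n%2d $secs%.5f secs")
}
\end{lstlisting}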
What is interesting is that even a relatively simple-minded regular
expression matcher based on Brzozowski derivatives can out-compete the
regular expression matchers in Java and Python on such catastrophic
backtracking examples. This gain in speed is the motivation and
starting point for the proposed project. \smallskip

\subsubsection*{Research Plan}

Regular expressions are already an old and well-studied topic in
Computer Science. They were originally introduced by Kleene in 1951
and are utilised in text search algorithms for ``find'' or ``find and
replace'' operations \cite{Brian,Kleene51}. Because of their wide
range of applications, many programming languages provide capabilities
for regular expression matching via libraries. The classic theory
of regular expressions involves just six different kinds of regular
expressions, often defined in terms of the following grammar:

\begin{center}\small
\begin{tabular}{lcll}
  $r$ & $::=$ & $\ZERO$ & cannot match anything\\
      & $|$ & $\ONE$ & can only match the empty string\\
      & $|$ & $c$ & can match a single character (in this case $c$)\\
      & $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
      & $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
      & & & then the second part with $r_2$\\
      & $|$ & $r^*$ & can match $r$ zero or more times\\
\end{tabular}
\end{center}

\noindent
For practical purposes, however, regular expressions have been
extended in various ways in order to make them even more powerful and
even more useful for applications. Some of the problems to do with
catastrophic backtracking stem, unfortunately, from these extensions.

The crux of this project is not to use automata for regular expression
matching (the traditional technique), but to use derivatives of
regular expressions instead. These derivatives were introduced by
Brzozowski in 1964 \cite{Brzozowski1964}, but somehow had been lost
``in the sands of time'' \cite{Owens2009}. Recently, however, they
have again become a ``hot'' research topic, with numerous research
papers appearing in the last five years. One reason for this interest
is that Brzozowski derivatives straightforwardly extend to more
general regular expression constructors, which cannot be easily
treated with standard techniques. They can also be implemented with
ease in any functional programming language: the definition of
derivatives consists of just two simple recursive functions shown in
Figure~\ref{f}. Moreover, it can be shown (by a mathematical proof)
that the resulting regular expression matcher always produces a
correct answer. This is an area where Urban can provide a lot of
expertise for this project: he is one of the main developers of the
Isabelle theorem prover, which is a program designed for such proofs
\cite{Isabelle}.
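To illustrate how little code is needed, the following is a minimal
Scala sketch of the two functions from Figure~\ref{f} together with
the resulting matcher (the datatype follows the one in the
repository's \texttt{autos.scala}; the simplification steps needed for
good performance in practice are omitted):

\begin{lstlisting}[language=Scala]
abstract class Rexp
case object ZERO extends Rexp
case object ONE extends Rexp
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// derivative of r with respect to the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// a string matches r iff the iterated derivative is nullable
def matches(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}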
\begin{figure}[t]
\begin{center}\small
\begin{tabular}{lcl}
$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
$\textit{nullable}(\ONE)$ & $\dn$ & $\textit{true}$\\
$\textit{nullable}(c)$ & $\dn$ & $\textit{false}$\\
$\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\medskip\\

$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
$\textit{der}\;c\;(\ONE)$ & $\dn$ & $\ZERO$\\
$\textit{der}\;c\;(d)$ & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
$\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
$\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
  & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
  & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\[-5mm]
\end{tabular}
\end{center}
\caption{The complete definition of derivatives of regular expressions
  \cite{Brzozowski1964}. This definition can be implemented with ease
  in functional programming languages and can be easily extended to
  the regular expressions used in practice. Derivatives are more
  flexible for applications and easier to manipulate in mathematical
  proofs than traditional techniques for regular expression
  matching.\medskip\label{f}}
\end{figure}

There are a number of avenues for research on Brzozowski derivatives.
One problem I would like to tackle immediately in this project is how
to handle \emph{back-references} in regular expressions. It is known
that the inclusion of back-references causes the underlying algorithmic
problem to become NP-hard \cite{Aho}, and it is the main reason why
regular expression matchers suffer from the catastrophic backtracking
problem. However, the full generality of back-references is not
needed in practical applications. The goal therefore is to sufficiently
restrict the problem so that an efficient algorithm can be
designed. There seem to be good indications that Brzozowski
derivatives are useful for that.

Another problem is \emph{how} regular expressions match a string
\cite{Sulzmann2014}. In this case the algorithm does not just give a
yes/no answer about whether the regular expression matches the string,
but also generates a value that encodes which part of the string is
matched by which part of the regular expression. This is important in
applications like network analysis, where a specific computer address
needs to be extracted from a matching log entry. Compilers also make
extensive use of such an extension when they lex their source
programs, i.e.~break up code into word-like components. Again, Ausaf
and Urban from King's made some initial progress on proving the
correctness of such an extension, but their algorithm is not yet fast
enough for practical purposes \cite{AusafDyckhoffUrban2016}. It would
be desirable to study optimisations that make the algorithm faster,
but preserve the correctness guarantees obtained by Ausaf and Urban.
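As a rough illustration of what such values look like, here is a
sketch in Scala (the constructor names loosely follow the
formalisation of Ausaf and Urban \cite{AusafDyckhoffUrban2016}; the
lexing algorithm that actually computes these values is omitted):

\begin{lstlisting}[language=Scala]
// values record *how* a regular expression matched a string
abstract class Val
case object Empty extends Val                  // for ONE
case class Chr(c: Char) extends Val            // for CHAR
case class Sequ(v1: Val, v2: Val) extends Val  // for SEQ
case class Left(v: Val) extends Val            // for ALT (left branch)
case class Right(v: Val) extends Val           // for ALT (right branch)
case class Stars(vs: List[Val]) extends Val    // for STAR

// the underlying string of a value ("flattening")
def flatten(v: Val): String = v match {
  case Empty => ""
  case Chr(c) => c.toString
  case Sequ(v1, v2) => flatten(v1) + flatten(v2)
  case Left(v1) => flatten(v1)
  case Right(v1) => flatten(v1)
  case Stars(vs) => vs.map(flatten).mkString
}
\end{lstlisting}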
\mbox{}\\[-11mm]

\subsubsection*{Conclusion}

Much research has already gone into regular expressions, and one might
think they have been studied and implemented to death. But this is far
from the truth. In fact regular expressions have become a ``hot''
research topic in recent years because of problems with the
traditional methods (in applications such as network traffic
analysis), but also because the technique of derivatives of regular
expressions has been re-discovered. These derivatives provide an
alternative approach to regular expression matching. Their potential
lies in the fact that they are more flexible in implementations and
much easier to manipulate in mathematical proofs. I would therefore
like to study them in this project.\medskip

\noindent {\bf Impact:} Regular expression matching is a core
technology in Computer Science with numerous applications. I will
focus on the areas of network traffic analysis and lexical
analysis. In these areas this project can have quite a large impact:
for example the program Bro has been downloaded at least 10,000 times and Snort
even has 5 million downloads and over 600,000 registered users
\cite{snort,bro}. Lexical analysis is a core component of every
compiler, and compilers are the bedrock on which nearly all
programming is built.

% {\small
% \begin{itemize}
% \item[$\bullet$] \url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
% \item[$\bullet$] \url{https://vimeo.com/112065252}
% \item[$\bullet$] \url{http://davidvgalbraith.com/how-i-fixed-atom/}
% \end{itemize}}
\mbox{}\\[-11mm]

\small
\bibliographystyle{plain}
\renewcommand{\refname}{References\\[-7mm]\mbox{}}
\bibliography{proposal}

\end{document}


%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
diff -r bfab5aded21d -r c4d821c6309d text/style.sty
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/text/style.sty	Tue May 15 10:24:25 2018 +0100
@@ -0,0 +1,79 @@
+\usepackage{xcolor}
+%%\usepackage{fontspec}
+\usepackage[sc]{mathpazo}
+\usepackage{fontspec}
+\setmainfont[Ligatures=TeX]{Palatino Linotype}
+\usepackage{amssymb}
+\usepackage{amsmath}
+\usepackage{menukeys}
+\definecolor{darkblue}{rgb}{0,0,0.6}
+\usepackage[colorlinks=true,urlcolor=darkblue,linkcolor=darkblue]{hyperref}
+
+%%% for regular expressions and values
+\newcommand{\ZERO}{\mbox{\bf 0}}
+\newcommand{\ONE}{\mbox{\bf 1}}
+\newcommand{\Left}{\textit{Left}}
+\newcommand{\Der}{\textit{Der}}
+\newcommand{\der}{\textit{der}}
+\newcommand{\Ders}{\textit{Ders}}
+\newcommand{\ders}{\textit{ders}}
+
+%%% for trees
+%% http://anorien.csc.warwick.ac.uk/mirrors/CTAN/graphics/pgf/contrib/forest/forest.pdf
+
+\newcommand\grid[1]{%
+\begin{tikzpicture}[baseline=(char.base)]
+  \path[use as bounding box]
+    (0,0) rectangle (1em,1em);
+  \draw[red!50, fill=red!20]
+    (0,0) rectangle (1em,1em);
+  \node[inner sep=1pt,anchor=base west]
+    (char) at (0em,\gridraiseamount) {#1};
+\end{tikzpicture}}
+\newcommand\gridraiseamount{0.12em}
+
+\makeatletter
+\newcommand\Grid[1]{%
+  \@tfor\z:=#1\do{\grid{\z}}}
+\makeatother
+
+\newcommand\Vspace[1][.3em]{%
+  \mbox{\kern.06em\vrule height.3ex}%
+  \vbox{\hrule width#1}%
+  \hbox{\vrule height.3ex}}
+
+\def\VS{\Vspace[0.6em]}
+
+
+\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}
+\newcommand{\defn}[1]{\textit{\textbf{#1}}}
+\newcommand{\dq}[1]{\mbox{\tt{"}}#1\mbox{\tt{"}}}
+
+\definecolor{codegray}{gray}{0.9}
+
+\makeatletter
+\def\fnote{\gdef\@thefnmark{}\@footnotetext}
+\makeatother
+
+\newcommand{\HEADER}{{\bf Please submit your solutions via email. Please submit
+only ASCII text or PDFs.
Every solution should be preceded by the corresponding
question text, like:

\begin{center}
\begin{tabular}{ll}
Q$n$: & \ldots a difficult question from me\ldots\\
A: & \ldots an answer from you \ldots\\
Q$n+1$: & \ldots another difficult question\ldots\\
A: & \ldots another brilliant answer from you\ldots
\end{tabular}
\end{center}

\noindent Solutions will only be accepted until 20th December! Please send only
one homework per email.}\bigskip}

\newcommand{\POSTSCRIPT}{
{\bf (Optional)} This question is for you to provide
regular feedback to me: for example
what were the most interesting, least interesting, or confusing
parts in this lecture? Any problems with my Scala code? Please
feel free to share any other questions or concerns.}
diff -r bfab5aded21d -r c4d821c6309d thys/Exercises.thy
--- a/thys/Exercises.thy	Fri Jan 12 02:33:35 2018 +0000
+++ b/thys/Exercises.thy	Tue May 15 10:24:25 2018 +0100
@@ -30,6 +30,8 @@
   zeroable r1 \<or> zeroable r2 \<or> (atmostempty r1 \<and> atmostempty r2)"
| "atmostempty (STAR r) = atmostempty r"
+
+

fun somechars :: "rexp \<Rightarrow> bool"
where
@@ -50,8 +52,7 @@
 apply blast
 apply(auto)
 using Star_cstring
-by (metis concat_eq_Nil_conv)
-
+  by (metis concat_eq_Nil_conv)

lemma atmostempty_correctness_aux:
  shows "atmostempty r \<longleftrightarrow> \<not> somechars r"
@@ -65,6 +66,33 @@
  by(auto simp add: atmostempty_correctness_aux somechars_correctness)

fun
+  leastsinglechar :: "rexp \<Rightarrow> bool"
+where
+  "leastsinglechar (ZERO) \<longleftrightarrow> False"
+| "leastsinglechar (ONE) \<longleftrightarrow> False"
+| "leastsinglechar (CHAR c) \<longleftrightarrow> True"
+| "leastsinglechar (ALT r1 r2) \<longleftrightarrow> leastsinglechar r1 \<or> leastsinglechar r2"
+| "leastsinglechar (SEQ r1 r2) \<longleftrightarrow>
+    (if (zeroable r1 \<or> zeroable r2) then False
+     else ((nullable r1 \<and> leastsinglechar r2) \<or> (nullable r2 \<and> leastsinglechar r1)))"
+| "leastsinglechar (STAR r) \<longleftrightarrow> leastsinglechar r"
+
+lemma leastsinglechar_correctness:
+  "leastsinglechar r \<longleftrightarrow> (\<exists>c. [c] \<in> L r)"
+  apply(induct r)
+  apply(simp)
+  apply(simp)
+  apply(simp)
+  prefer 2
+  apply(simp)
+  apply(blast)
+  prefer 2
+  apply(simp)
+  using Star.step Star_decomp apply fastforce
+  apply(simp add: Sequ_def zeroable_correctness nullable_correctness)
+  by (metis append_Nil append_is_Nil_conv butlast_append butlast_snoc)
+
fun infinitestrings :: "rexp \<Rightarrow> bool"
where
  "infinitestrings (ZERO) = False"
diff -r bfab5aded21d -r c4d821c6309d thys/Simplifying.thy
--- a/thys/Simplifying.thy	Fri Jan 12 02:33:35 2018 +0000
+++ b/thys/Simplifying.thy	Tue May 15 10:24:25 2018 +0100
@@ -32,6 +32,7 @@
| "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, f\<^sub>2) = (r\<^sub>1, F_LEFT f\<^sub>1)"
| "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2) = (ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"

+(* where is ZERO *)
fun simp_SEQ where
  "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2) = (r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"
| "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2) = (r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"
@@ -227,6 +228,144 @@
    ultimately show "slexer r (c # s) = lexer r (c # s)"
      by (simp del: slexer.simps add: slexer_better_simp)
  qed
-qed
+  qed
+
+
+fun simp2_ALT where
+  "simp2_ALT ZERO r2 seen = (r2, seen)"
+| "simp2_ALT r1 ZERO seen = (r1, seen)"
+| "simp2_ALT r1 r2 seen = (ALT r1 r2, seen)"
+
+fun simp2_SEQ where
+  "simp2_SEQ ZERO r2 seen = (ZERO, seen)"
+| "simp2_SEQ r1 ZERO seen = (ZERO, seen)"
+| "simp2_SEQ ONE r2 seen = (r2, seen \<union> {r2})"
+| "simp2_SEQ r1 ONE seen = (r1, seen \<union> {r1})"
+| "simp2_SEQ r1 r2 seen = (SEQ r1 r2, seen \<union> {SEQ r1 r2})"
+
+
+fun
+  simp2 :: "rexp \<Rightarrow> rexp set \<Rightarrow> rexp * rexp set"
+where
+  "simp2 (ALT r1 r2) seen =
+     (let (r1s, seen1) = simp2 r1 seen in
+      let (r2s, seen2) = simp2 r2 seen1
+      in simp2_ALT r1s r2s seen2)"
+| "simp2 (SEQ r1 r2) seen =
+     (let (r1s, _) = simp2 r1 {} in
+      let (r2s, _) = simp2 r2 {}
+      in if (SEQ r1s r2s \<in> seen) then (ZERO, seen)
+         else simp2_SEQ r1s r2s seen)"
+| "simp2 r seen = (if r \<in> seen then (ZERO, seen) else (r, seen \<union> {r}))"
+
+
+lemma simp2_ALT[simp]:
+  shows "L (fst (simp2_ALT r1 r2 seen)) = L r1 \<union> L r2"
+  apply(induct r1 r2 seen rule: simp2_ALT.induct)
+  apply(simp_all)
+  done
+
+lemma test1:
+  shows "snd (simp2_ALT r1 r2 rs) = rs"
+  apply(induct r1 r2 rs rule: simp2_ALT.induct)
+  apply(auto)
+  done
+
+lemma test2:
+  shows "rs \<subseteq> snd (simp2_SEQ r1 r2 rs)"
+  apply(induct r1 r2 rs rule: simp2_SEQ.induct)
+  apply(auto)
+  done
+
+lemma test3:
+  shows "rs \<subseteq> snd (simp2 r rs)"
+  apply(induct r rs rule: simp2.induct)
+  apply(auto split: prod.split)
+  apply (metis set_mp test1)
+  by (meson set_mp test2)
+
+lemma test3a:
+  shows "\<Union>(L ` snd (simp2 r rs)) \<subseteq> L(r) \<union> (\<Union> (L ` rs))"
+  apply(induct r rs rule: simp2.induct)
+  apply(auto split: prod.split simp add: Sequ_def)
+  apply (smt UN_iff Un_iff set_mp test1)
+  (* remaining goals unproved *)
+  oops
+
+lemma test:
+  assumes "(\<Union>r' \<in> rs. L r') \<subseteq> L r"
+  shows "L(fst (simp2 r rs)) \<subseteq> L(r)"
+  using assms
+  apply(induct r arbitrary: rs)
+  apply(simp only: simp2.simps)
+  apply(simp)
+  apply(simp only: simp2.simps)
+  apply(simp)
+  apply(simp only: simp2.simps)
+  apply(simp)
+  prefer 3
+  apply(simp only: simp2.simps)
+  apply(simp)
+  prefer 2
+  apply(simp only: simp2.simps)
+  apply(simp)
+  apply(subst prod.split)
+  apply(rule allI)+
+  apply(rule impI)
+  apply(subst prod.split)
+  apply(rule allI)+
+  apply(rule impI)
+  apply(simp)
+  apply(drule_tac x="rs" in meta_spec)
+  apply(simp)
+  apply(drule_tac x="x2" in meta_spec)
+  apply(simp)
+  apply(auto)[1]
+  apply(subgoal_tac "rs \<subseteq> x2a")
+  prefer 2
+  apply (metis order_trans prod.sel(2) test3)
+
+
+  apply(rule antisym)
+  prefer 2
+  apply(simp)
+  apply(rule conjI)
+  apply(drule meta_mp)
+  prefer 2
+  apply(simp)
+  apply(auto)[1]
+  apply(auto)[1]
+
+  thm prod.split
+  apply(auto split: prod.split)[1]
+  apply(drule_tac x="rs" in meta_spec)
+  apply(drule_tac x="rs" in meta_spec)
+  apply(simp)
+  apply(rule_tac x="SEQ x1 x1a" in bexI)
+  apply(simp add: Sequ_def)

+  apply(auto)[1]
+  apply(simp)
+  apply(subgoal_tac "L (fst (simp2_SEQ x1 x1a rs)) \<subseteq> L x1 ;; L x1a")
+  apply(frule_tac x="{}" in meta_spec)
+  apply(rotate_tac 1)
+  apply(frule_tac x="{}" in meta_spec)
+  apply(simp)
+
+
+  apply(rule_tac x="SEQ x1 x1a" in bexI)
+  apply(simp add: Sequ_def)
+  apply(auto)[1]
+  apply(simp)
+  using L.simps(2) apply blast
+  prefer 3
+  apply(simp only: simp2.simps)
+  apply(simp)
+  using L.simps(3) apply blast
+  prefer 2
+  apply(simp add: Sequ_def)
+  apply(auto)[1]
+  oops

end
\ No newline at end of file