# HG changeset patch # User Christian Urban # Date 1490280566 0 # Node ID acd8780bfc8b862844d15757aed407464339f596 # Parent 9e42ccbbd1e65f1cc9ac65820a17291c183fa7a8# Parent 52aa298211f69c59f3f4211c66f7cec1b19c13d9 merged diff -r 52aa298211f6 -r acd8780bfc8b handouts/ho02.tex --- a/handouts/ho02.tex Fri Mar 17 12:15:58 2017 +0000 +++ b/handouts/ho02.tex Thu Mar 23 14:49:26 2017 +0000 @@ -3,8 +3,7 @@ \usepackage{../langs} \usepackage{../graphics} \usepackage{../data} -\usepackage{lstlinebgrd} -\definecolor{capri}{rgb}{0.0, 0.75, 1.0} + \begin{document} \fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016, 2017} diff -r 52aa298211f6 -r acd8780bfc8b handouts/ho03.pdf Binary file handouts/ho03.pdf has changed diff -r 52aa298211f6 -r acd8780bfc8b handouts/ho03.tex --- a/handouts/ho03.tex Fri Mar 17 12:15:58 2017 +0000 +++ b/handouts/ho03.tex Thu Mar 23 14:49:26 2017 +0000 @@ -4,26 +4,32 @@ \usepackage{../graphics} +%We can even allow ``silent +%transitions'', also called epsilon-transitions. They allow us +%to go from one state to the next without having a character +%consumed. We label such silent transition with the letter +%$\epsilon$. + \begin{document} +\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016, 2017} \section*{Handout 3 (Automata)} -Every formal language course I know of bombards you first with -automata and then to a much, much smaller extend with regular -expressions. As you can see, this course is turned upside -down: regular expressions come first. The reason is that -regular expressions are easier to reason about and the notion -of derivatives, although already quite old, only became more -widely known rather recently. Still let us in this lecture -have a closer look at automata and their relation to regular -expressions. This will help us with understanding why the -regular expression matchers in Python, Ruby and Java are so slow -with certain regular expressions. The central definition -is:\medskip +Every formal language and compiler course I know of bombards you first +with automata and then to a much, much smaller extend with regular +expressions. As you can see, this course is turned upside down: +regular expressions come first. The reason is that regular expressions +are easier to reason about and the notion of derivatives, although +already quite old, only became more widely known rather +recently. Still let us in this lecture have a closer look at automata +and their relation to regular expressions. This will help us with +understanding why the regular expression matchers in Python, Ruby and +Java are so slow with certain regular expressions. The central +definition is:\medskip \noindent A \emph{deterministic finite automaton} (DFA), say $A$, is -defined by a four-tuple written $A(Q, q_0, F, \delta)$ where +given by a four-tuple written $A(Q, q_0, F, \delta)$ where \begin{itemize} \item $Q$ is a finite set of states, @@ -97,36 +103,86 @@ \] \noindent This lifted transition function is often called -``delta-hat''. Given a string, we start in the starting state -and take the first character of the string, follow to the next -sate, then take the second character and so on. Once the -string is exhausted and we end up in an accepting state, then -this string is accepted by the automaton. Otherwise it is not -accepted. So $s$ is in the \emph{language accepted by the -automaton} $A(Q, q_0, F, \delta)$ iff +``delta-hat''. Given a string, we start in the starting state and take +the first character of the string, follow to the next sate, then take +the second character and so on. Once the string is exhausted and we +end up in an accepting state, then this string is accepted by the +automaton. Otherwise it is not accepted. This also means that if along +the way we hit the case where the transition function $\delta$ is not +defined, we need to raise an error. In our implementation we will deal +with this case elegantly by using Scala's \texttt{Try}. So $s$ is in +the \emph{language accepted by the automaton} $A(Q, q_0, F, \delta)$ +iff \[ \hat{\delta}(q_0, s) \in F \] \noindent I let you think about a definition that describes -the set of strings accepted by an automaton. - +the set of strings accepted by an automaton. + +\begin{figure}[p] +\small +\lstinputlisting[numbers=left,linebackgroundcolor= + {\ifodd\value{lstnumber}\color{capri!3}\fi}] + {../progs/dfa.scala} +\caption{Scala implementation of DFAs using partial functions. + Notice some subtleties: \texttt{deltas} implements the delta-hat + construction by lifting the transition (partial) function to + lists of \texttt{C}haracters. Since \texttt{delta} is given + as partial function, it can obviously go ``wrong'' in which + case the \texttt{Try} in \texttt{accepts} catches the error and + returns \texttt{false}, that is the string is not accepted. + The example implements the DFA example shown above.\label{dfa}} +\end{figure} + +A simple Scala implementation of DFAs is given in Figure~\ref{dfa}. As +you can see, there many features of the mathematical definition that +are quite closely reflected in the code. In the DFA-class, there is a +starting state, called \texttt{start}, with the polymorphic type +\texttt{A}. There is a partial function \texttt{delta} for specifying +the transitions. I used partial functions for transitions in Scala; +alternatives would have been \texttt{Maps} or even \texttt{Lists}. One +of the main advantages of using partial functions is that transitions +can be quite nicely defined by a series of \texttt{case} statements +(see Lines 28 -- 38). If you need to represent an automaton +with a sink state (catch-all-state), you can write something like -While with DFAs it will always be clear that given a character -what the next state is (potentially none), it will be useful -to relax this restriction. That means we have several -potential successor states. We even allow ``silent -transitions'', also called epsilon-transitions. They allow us -to go from one state to the next without having a character -consumed. We label such silent transition with the letter -$\epsilon$. The resulting construction is called a -\emph{non-deterministic finite automaton} (NFA) given also as -a four-tuple $A(Q, q_0, F, \rho)$ where +{\small\begin{lstlisting}[language=Scala,linebackgroundcolor= + {\ifodd\value{lstnumber}\color{capri!3}\fi}] +abstract class State +... +case object Sink extends State + +val delta : (State, Char) :=> State = + { case (S0, 'a') => S1 + case (S1, 'a') => S2 + case _ => Sink + } +\end{lstlisting}} + +\noindent The DFA-class has also an argument for final states. It is a +function from states to booleans (returns true whenever a state is +meant to be finbal; false otherwise). While this is a boolean +function, Scala allows me to use sets of states for such functions +(see Line 40 where I initialise the DFA). + +I let you ponder whether this is a good implementation of DFAs. In +doing so I hope you notice that the $Q$ component (the set of finite +states) is missing from the class definition. This means that the +implementation allows you to do some fishy things you are not meant to +do with DFAs. Which fishy things could that be? + +While with DFAs it will always be clear that given a character what +the next state is (potentially none), it will be useful to relax this +restriction. That means we have several potential successor states. We +even allow more than one starting state. The resulting construction is +called a \emph{non-deterministic finite automaton} (NFA) given also as +a four-tuple $A(Q, Q_{0s}, F, \rho)$ where \begin{itemize} \item $Q$ is a finite set of states -\item $q_0$ is a start state +\item $Q_{0s}$ are the start states ($Q_{0s} \subseteq Q$) \item $F$ are some accepting states with $F \subseteq Q$, and \item $\rho$ is a transition relation. \end{itemize} diff -r 52aa298211f6 -r acd8780bfc8b langs.sty --- a/langs.sty Fri Mar 17 12:15:58 2017 +0000 +++ b/langs.sty Thu Mar 23 14:49:26 2017 +0000 @@ -68,3 +68,9 @@ %%\lstset{escapeinside={(*@}{@*)}} \lstset{escapeinside={/*@}{@*/}} + + + +%% stripy code +\usepackage{lstlinebgrd} +\definecolor{capri}{rgb}{0.0, 0.75, 1.0} diff -r 52aa298211f6 -r acd8780bfc8b progs/dfa.scala --- a/progs/dfa.scala Fri Mar 17 12:15:58 2017 +0000 +++ b/progs/dfa.scala Thu Mar 23 14:49:26 2017 +0000 @@ -1,132 +1,43 @@ -// convert normal string to hex bytes string -def string2hex(str: String): String = { - str.toList.map(_.toInt.toHexString).mkString -} - -// convert hex bytes string to normal string +// DFAs in Scala based on partial functions +import scala.util.Try +// type abbreviation for partial functions +type :=>[A, B] = PartialFunction[A, B] -val byteArray: Array[Byte] = List.range(1, 255).map(_.toByte).toArray -val bos = new BufferedOutputStream(new FileOutputStream("test.txt")) -bos.write(byteArray) -bos.close() - +case class DFA[A, C](start: A, // starting state + delta: (A, C) :=> A, // transition partial fun + fins: A => Boolean) { // final states -def string2int(hex: String) = { - hex.sliding(2, 2).map(Integer.parseInt(_, 16)).toArray -} + def deltas(q: A, s: List[C]) : A = s match { + case Nil => q + case c::cs => deltas(delta(q, c), cs) + } -def test(l: List[Int]) = { - l.map(_.toChar).mkString + def accepts(s: List[C]) : Boolean = + Try(fins(deltas(start, s))) getOrElse false } - -import java.io._ -val writer = new PrintWriter(new File("test.txt" )) - -//writer.write(string2int("cafebabe")) -writer.write(test(List.range(1, 255))) -writer.close() - -import scalax.io._ -import scalax.io.Resource - -FileOutputStream fos = new FileOutputStream("test"); -fos.write(bytesArray); -fos.close(); - - -string2int("cafebabe") - -202.toHexString - -"cafebabe".sliding(1, 1).toList.map(Integer.parseInt(_, 16)).map(_.toByte).map(_.toChar) - -hex2string("ca") -string2hex("ca") -hex2string("cafebabe") - - -val appkey = "9GLV//lv/kYFW2o3/bihxwnMcmo=" - - // string to hex - val appkey_hex = string2hex(appkey) - // 39474c562f2f6c762f6b594657326f332f62696878776e4d636d6f3d - println(appkey_hex) - - // hex to string - val appkey_string_again = hex2string(appkey_hex) - // 9GLV//lv/kYFW2o3/bihxwnMcmo= - println(appkey_string_again) - } - - -List("ca", "fe", "ba", "be").map(_.toByte) - -"ca".toByte - - -// DFAs -import scala.util._ - +// the example shown earlier in the handout abstract class State -type States = Set[State] - -case class IntState(i: Int) extends State +case object Q0 extends State +case object Q1 extends State +case object Q2 extends State +case object Q3 extends State +case object Q4 extends State -object NewState { - var counter = 0 - - def apply() : IntState = { - counter += 1; - new IntState(counter - 1) - } -} - - -// DFA class -case class DFA(states: States, - start: State, - delta: (State, Char) => State, - fins: States) { - - def deltas(q: State, s: List[Char]) : State = s match { - case Nil => q - case c::cs => deltas(delta(q, c), cs) - } - - // wether a string is accepted by the automaton or not - def accepts(s: String) : Boolean = - Try(fins contains - (deltas(start, s.toList))) getOrElse false -} - +val delta : (State, Char) :=> State = + { case (Q0, 'a') => Q1 + case (Q0, 'b') => Q2 + case (Q1, 'a') => Q4 + case (Q1, 'b') => Q2 + case (Q2, 'a') => Q3 + case (Q2, 'b') => Q2 + case (Q3, 'a') => Q4 + case (Q3, 'b') => Q0 + case (Q4, 'a') => Q4 + case (Q4, 'b') => Q4 } -// example DFA from the lectures -val Q0 = NewState() -val Q1 = NewState() -val Q2 = NewState() -val Q3 = NewState() -val Q4 = NewState() - +val dfa = DFA(Q0, delta, Set[State](Q4)) -val delta : (State, Char) => State = { - case (Q0, 'a') => Q1 - case (Q0, 'b') => Q2 - case (Q1, 'a') => Q4 - case (Q1, 'b') => Q2 - case (Q2, 'a') => Q3 - case (Q2, 'b') => Q2 - case (Q3, 'a') => Q4 - case (Q3, 'b') => Q0 - case (Q4, 'a') => Q4 - case (Q4, 'b') => Q4 -} - -val DFA1 = DFA(Set(Q0, Q1, Q2, Q3, Q4), Q0, delta, Set(Q4)) - -println(DFA1.accepts("aaa")) -println(DFA1.accepts("bb")) -println(DFA1.accepts("aaac")) - - +dfa.accepts("bbabaab".toList) // true +dfa.accepts("baba".toList) // false diff -r 52aa298211f6 -r acd8780bfc8b progs/re1.scala --- a/progs/re1.scala Fri Mar 17 12:15:58 2017 +0000 +++ b/progs/re1.scala Thu Mar 23 14:49:26 2017 +0000 @@ -77,6 +77,7 @@ (end - start)/(i * 1.0e9) } + //test: (a?{n}) (a{n}) for (i <- 1 to 20) { println(i + ": " + "%.5f".format(time_needed(2, matches(EVIL1(i), "a" * i))))