hws/hw03.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Wed, 29 May 2024 13:25:30 +0100
changeset 960 c7009356ddd8
parent 943 5365ef60707e
permissions -rw-r--r--
updated

\documentclass{article}
\usepackage{../style}
\usepackage{../graphics}

\begin{document}

\section*{Homework 3}

%\HEADER

\begin{enumerate}
\item The regular expression matchers in Java, Python and Ruby can be
  very slow with some (basic) regular expressions. What is the main
  reason for this inefficient computation?

  \solution{Many matchers employ DFS type of algorithms to check
    if a string is matched by the regex or not. Such algorithms
    require backtracking if have gone down the wrong path which
    can be very slow. There are also problems with bounded regular
  expressions and backreferences.}
  
\item What is a regular language? Are there alternative ways
      to define this notion? If yes, give an explanation why
      they define the same notion.

      \solution{A regular language is a language for which every string
        can be recognized by some regular expression. Another definition is
        that it is a language for which a finite automaton can be
        constructed. Both define the same set of languages.}   

\item Why is every finite set of strings a regular language?

  \solution{Take a regex composed of all strings (works for finite languages)}
  
\item Assume you have an alphabet consisting of the letters
      $a$, $b$ and $c$ only. (1) Find a regular expression
      that recognises the two strings $ab$ and $ac$. (2) Find
      a regular expression that matches all strings
      \emph{except} these two strings. Note, you can only use
      regular expressions of the form
      
  \begin{center} $r ::=
    \ZERO \;|\; \ONE \;|\; c \;|\; r_1 + r_2 \;|\;
    r_1 \cdot r_2 \;|\; r^*$ 
  \end{center}

%\item Define the function \textit{zeroable} which takes a
%      regular expression as argument and returns a boolean.
%      The function should satisfy the following property:
%
%  \begin{center}
%    $\textit{zeroable(r)} \;\text{if and only if}\; 
%    L(r) = \{\}$
%  \end{center}

  \solution{Done in the video but there I forgot to include the empty string.}

\item Given the alphabet $\{a,b\}$. Draw the automaton that has two
  states, say $Q_0$ and $Q_1$.  The starting state is $Q_0$ and the
  final state is $Q_1$. The transition function is given by

  \begin{center}
    \begin{tabular}{l}
      $(Q_0, a) \rightarrow Q_0$\\
      $(Q_0, b) \rightarrow Q_1$\\
      $(Q_1, b) \rightarrow Q_1$
    \end{tabular}
  \end{center}

  What is the language recognised by this automaton?

  \solution{
    All strings consisting of 0 or more a's then 1 or more b's,
    which is equivalent to the language of the regular
    expression $a^* \cdot b \cdot b^*$.  
  }

\item Give a non-deterministic finite automaton that can
  recognise the language $L(a\cdot (a + b)^* \cdot c)$.

  \solution{It is already possible to just read off the automaton without
  going through Thompson.}

\item Given a deterministic finite automaton $A(\varSigma, Q, Q_0, F,
      \delta)$, define which language is recognised by this
      automaton. Can you define also the language defined by a
      non-deterministic automaton?


      \solution{
        A formula for DFAs is

        \[L(A) \dn \{s \;|\; \hat{\delta}(start_q, s) \in F\}\]

        For NFAs you need to first define what $\hat{\rho}$ means. If
        $\rho$ is given as a relation, you can define:

        \[
          \hat{\rho}(qs, []) \dn qs \qquad
          \hat{\rho}(qs, c::s) \dn \bigcup_{q\in qs} \hat{\rho}(\{ q' \; | \; \rho(q, c, q')\}, s)
        \]

        This ``collects'' all the states reachable in a breadth-first
        manner. Once you have all the states reachable by an NFA, you can define
        the language as

        \[
        L(N) \dn \{s \;|\; \hat{\rho}(qs_{start}, s) \cap F \not= \emptyset\}
        \]  

        Here you test whether the all states reachable (for $s$) contain at least
        a single accepting state.
        
      }

     

\item Given the following deterministic finite automaton over
      the alphabet $\{a, b\}$, find an automaton that
      recognises the complement language. (Hint: Recall that
      for the algorithm from the lectures, the automaton needs
      to be in completed form, that is have a transition for
      every letter from the alphabet.)

      \solution{
        Before exchanging accepting and non-accepting states, it is important that
        the automaton is completed (meaning has a transition for every letter
        of the alphabet). If not completed, you have to introduce a sink state.

        For fun you can try out the example without
        completion: Then the original automaton can recognise
        strings of the form $a$, $ab...b$; but the ``uncompleted'' automaton would
        recognise only the empty string.
      }

  \begin{center}
    \begin{tikzpicture}[>=stealth',very thick,auto,
                        every state/.style={minimum size=0pt,
                        inner sep=2pt,draw=blue!50,very thick,
                        fill=blue!20},scale=2]
      \node[state, initial]        (q0) at ( 0,1) {$Q_0$};
      \node[state, accepting]  (q1) at ( 1,1) {$Q_1$};
      \path[->] (q0) edge node[above] {$a$} (q1)
                (q1) edge [loop right] node {$b$} ();
    \end{tikzpicture}
  \end{center}



%\item Given the following deterministic finite automaton
%
%\begin{center}
%\begin{tikzpicture}[scale=3, line width=0.7mm]
%  \node[state, initial]        (q0) at ( 0,1) {$q_0$};
%  \node[state,accepting]  (q1) at ( 1,1) {$q_1$};
%  \node[state, accepting] (q2) at ( 2,1) {$q_2$};
%  \path[->] (q0) edge node[above] {$b$} (q1)
%                  (q1) edge [loop above] node[above] {$a$} ()
%                  (q2) edge [loop above] node[above] {$a, b$} ()
%                  (q1) edge node[above] {$b$} (q2)
%                  (q0) edge[bend right] node[below] {$a$} (q2)
%                  ;
%\end{tikzpicture}
%\end{center}
%find the corresponding minimal automaton. State clearly which nodes
%can be merged.

\item Given the following non-deterministic finite automaton
      over the alphabet $\{a, b\}$, find a deterministic
      finite automaton that recognises the same language:

  \begin{center}
    \begin{tikzpicture}[>=stealth',very thick,auto,
                        every state/.style={minimum size=0pt,
                        inner sep=2pt,draw=blue!50,very thick,
                        fill=blue!20},scale=2]
      \node[state, initial]        (q0) at ( 0,1) {$Q_0$};
      \node[state]                    (q1) at ( 1,1) {$Q_1$};
      \node[state, accepting] (q2) at ( 2,1) {$Q_2$};
      \path[->] (q0) edge node[above] {$a$} (q1)
                (q0) edge [loop above] node[above] {$b$} ()
                (q0) edge [loop below] node[below] {$a$} ()
                (q1) edge node[above] {$a$} (q2);
    \end{tikzpicture}
  \end{center}

   \solution{
        The DFA has three states Q0,Q1,Q2 with Q0 starting state and Q2 accepting.
        The transitions are (Q0,a)-> Q1 (Q0,b)->Q0 (Q1,a)->Q2 (Q1,b)->Q0
        (Q2,a)->Q2 (Q2,b)->Q0.
        }
  
\item %%\textbf{(Deleted for 2017, 2018, 2019)}
  Given the following deterministic finite automaton over the
  alphabet $\{0, 1\}$, find the corresponding minimal automaton. In
  case states can be merged, state clearly which states can be merged.

  \begin{center}
    \begin{tikzpicture}[>=stealth',very thick,auto,
                        every state/.style={minimum size=0pt,
                        inner sep=2pt,draw=blue!50,very thick,
                        fill=blue!20},scale=2]
      \node[state, initial]        (q0) at ( 0,1) {$Q_0$};
      \node[state]                    (q1) at ( 1,1) {$Q_1$};
      \node[state, accepting] (q4) at ( 2,1) {$Q_4$};
      \node[state]                    (q2) at (0.5,0) {$Q_2$};
      \node[state]                    (q3) at (1.5,0) {$Q_3$};
      \path[->] (q0) edge node[above] {$0$} (q1)
                (q0) edge node[right] {$1$} (q2)
                (q1) edge node[above] {$0$} (q4)
                (q1) edge node[right] {$1$} (q2)
                (q2) edge node[above] {$0$} (q3)
                (q2) edge [loop below] node {$1$} ()
                (q3) edge node[left] {$0$} (q4)
                (q3) edge [bend left=95, looseness = 2.2] node [left=2mm] {$1$} (q0)
                (q4) edge [loop right] node {$0, 1$} ();
    \end{tikzpicture}
  \end{center}

  \solution{Q0 and Q2 can be merged; and Q1 and Q3 as well}

\item Given the following finite deterministic automaton over the alphabet $\{a, b\}$:

  \begin{center}
    \begin{tikzpicture}[scale=2,>=stealth',very thick,auto,
                        every state/.style={minimum size=0pt,
                        inner sep=2pt,draw=blue!50,very thick,
                        fill=blue!20}]
      \node[state, initial, accepting]        (q0) at ( 0,1) {$Q_0$};
      \node[state, accepting]                    (q1) at ( 1,1) {$Q_1$};
      \node[state] (q2) at ( 2,1) {$Q_2$};
      \path[->] (q0) edge[bend left] node[above] {$a$} (q1)
                (q1) edge[bend left] node[above] {$b$} (q0)
                (q2) edge[bend left=50] node[below] {$b$} (q0)
                (q1) edge node[above] {$a$} (q2)
                (q2) edge [loop right] node {$a$} ()
                (q0) edge [loop below] node {$b$} ()
            ;
    \end{tikzpicture}
  \end{center}

  Give a regular expression that can recognise the same language as
  this automaton. (Hint: If you use Brzozwski's method, you can assume
  Arden's lemma which states that an equation of the form $q = q\cdot r + s$
  has the unique solution $q = s \cdot r^*$.)

  \solution{
    $(b + ab + aa(a^*)b)^* \cdot (1 + a)$
    }

\item If a non-deterministic finite automaton (NFA) has
  $n$ states. How many states does a deterministic 
  automaton (DFA) that can recognise the same language
  as the NFA maximal need?

  \solution{$2^n$ in the worst-case and for some regexes the worst case
    cannot be avoided. 

    Other comments: $r^{\{n\}}$ can only be represented as $n$
    copies of the automaton for $r$, which can explode the automaton for bounded
    regular expressions. Similarly, we have no idea how backreferences can be
    represented as automaton.
  }

\item Rust implements a non-backtracking regular expression matcher
  based on the classic idea of DFAs. Still, some regular expressions
  take a surprising amount of time for matching problems. Explain the
  problem?

  \solution{The problem has to do with bounded regular expressions,
    such as $r^{\{n\}}$. They are represented as $n$-copies of some
    automaton for $r$. If $n$ is large, then this can result in a
    large memory-footprint and slow runtime.}

\item Prove that for all regular expressions $r$ we have
      
\begin{center} 
  $\textit{nullable}(r) \quad \text{if and only if} 
  \quad [] \in L(r)$ 
\end{center}

      Write down clearly in each case what you need to prove
      and what are the assumptions. 

  
\item \POSTSCRIPT  
\end{enumerate}

\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: