hws/hw04.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Sat, 09 Sep 2023 14:14:31 +0100
changeset 916 10f834eb0a9e
parent 893 54a483a33763
child 939 f85e784d3014
permissions -rw-r--r--
texupdate

\documentclass{article}
\usepackage{../style}
\usepackage{../graphics}

\begin{document}

\section*{Homework 4}

%%\HEADER

\begin{enumerate}

\item Given the regular expressions

\begin{center}
\begin{tabular}{ll}    
  1) & $(ab + a)\cdot (\ONE + b)$\\
  2) & $(aa + a)^*$\\
\end{tabular}
\end{center}

there are several values for how these regular expressions can
recognise the strings (for 1) $ab$ and (for 2) $aaa$. Give in each case
\emph{all} the values and indicate which one is the POSIX value.
  

\item If a regular expression $r$ does not contain any occurrence of $\ZERO$,  
  is it possible for $L(r)$ to be empty? Explain why, or give a proof.

  \solution{
    The property to prove is

    \begin{center}
    $P(r)$: If $r$ does not contain $\ZERO$, then $L(r) \not= \emptyset$.  
  \end{center}

  For this you have to now go through all cases.

  Case $r = 0$: $P(\ZERO)$ says: If $\ZERO$ does not contain $\ZERO$
  then \ldots. The premise is obviously false, so everything follows,
  in particular $L(r) \not= \emptyset$.\medskip

  Case $r = \ONE$ and $r = c$ are similar, just that the premise is
  true, but also $L(\ONE)$ and $L(c)$ are not empty. So we shown
  $L(r) \not= \emptyset$.\medskip

  Case $r = r_1 + r_2$: We know $P(r_1)$ and $P(r_2)$ as IHs. We need to show
  $P(r_1 + r_2)$: If $r_1 + r_2$ does not contain $\ZERO$, then $L(r_1 + r_2) \not= \emptyset$.

  If $r_1 + r_2$ does not contain $\ZERO$, then also $r_1$ does not contain $\ZERO$
  and $r_2$ does not contain $\ZERO$. So we can apply the two IHs $P(r_1)$ and $P(r_2)$,
  which allow us to infer that $L(r_1) \not= \emptyset$ and $L(r_2) \not= \emptyset$.
  But if this is the case, then also $L(r_1 + r_2) \not= \emptyset$, which is what we needed
  to show in this case.\medskip

  The other cases are similar.\bigskip


  This lemma essentially says that for basic regular expressions, if
  they do not match anything at all, they must contain $\ZERO$(s)
  somewhere.

}

\item Define the tokens and regular expressions for a language
  consisting of numbers, left-parenthesis $($, right-parenthesis $)$,
  identifiers and the operations $+$, $-$ and $*$. Can the following
  strings in this language be lexed?

  \begin{itemize}
  \item $(a + 3) * b$
  \item $)()++ -33$
  \item $(a / 3) * 3$
  \end{itemize}

  In case they can, can you give the corresponding token
  sequences.

\item Assume $r$ is nullable. Show that
  \[ 1 + r + r\cdot r \;\equiv\; r\cdot r
  \]

  holds.

  \solution{
    If $r$ is nullable, then $1 + r \equiv r$. With this you can replace

    \begin{align}
      (1 + r) + r\cdot r & \equiv  r + r\cdot r\\
                         & \equiv  r \cdot (1 + r)\\
                         & \equiv  r \cdot r
    \end{align}  

    where in (2) you pull out the ``factor'' $r$ (because $r_1 \cdot (r_2 + r_3) \equiv r_1 \cdot r_2 + r_1 \cdot r_3$).
  }
  

\item \textbf{(Deleted)} Assume that $s^{-1}$ stands for the operation of reversing a
  string $s$. Given the following \emph{reversing} function on regular
  expressions

  \begin{center}
    \begin{tabular}{r@{\hspace{1mm}}c@{\hspace{1mm}}l}
      $rev(\ZERO)$   & $\dn$ & $\ZERO$\\
      $rev(\ONE)$         & $\dn$ & $\ONE$\\
      $rev(c)$                      & $\dn$ & $c$\\
      $rev(r_1 + r_2)$        & $\dn$ & $rev(r_1) + rev(r_2)$\\
      $rev(r_1 \cdot r_2)$  & $\dn$ & $rev(r_2) \cdot rev(r_1)$\\
      $rev(r^*)$                   & $\dn$ & $rev(r)^*$\\
    \end{tabular}
  \end{center}

  and the set

  \begin{center}
    $Rev\,A \dn \{s^{-1} \;|\; s \in A\}$
  \end{center}

  prove whether

  \begin{center}
    $L(rev(r)) = Rev (L(r))$
  \end{center}

  holds.

\item Assume the delimiters for comments are
      \texttt{$\slash$*} and \texttt{*$\slash$}. Give a
      regular expression that can recognise comments of the
      form

  \begin{center}
    \texttt{$\slash$*~\ldots{}~*$\slash$} 
  \end{center}

      where the three dots stand for arbitrary characters, but
      not comment delimiters. (Hint: You can assume you are
      already given a regular expression written \texttt{ALL},
      that can recognise any character, and a regular
      expression \texttt{NOT} that recognises the complement
      of a regular expression.)
  
\item Simplify the regular expression

\[
(\ZERO \cdot (b \cdot c)) + 
((\ZERO \cdot c) + \ONE)
\]

      Does simplification always preserve the meaning of a
      regular expression? 
     
\item The Sulzmann \& Lu algorithm contains the function
      $mkeps$ which answers how a regular expression can match
      the empty string. What is the answer of $mkeps$ for the
      regular expressions:    

  \[
  \begin{array}{l}
  (\ZERO \cdot (b \cdot c)) + 
  ((\ZERO \cdot c) + \ONE)\\
    (a + \ONE) \cdot (\ONE + \ONE)\\
    a^*
  \end{array}
  \]

\item What is the purpose of the record regular expression in
      the Sulzmann \& Lu algorithm?

\item Recall the functions \textit{nullable} and
      \textit{zeroable}.  Define recursive functions
      \textit{atmostempty} (for regular expressions that match no
      string or only the empty string), \textit{somechars} (for
      regular expressions that match some non-empty string),
      \textit{infinitestrings} (for regular expressions that can match
      infinitely many strings).

      \solution{
        \textbf{zeroable}: The property is $z(r) \;\text{iff}\; L(r) = \emptyset$:

        \begin{align}
          z(\ZERO) &\dn true\\
          z(\ONE)  &\dn false\\
          z(c)     &\dn false\\
          z(r_1 + r_2) &\dn z(r_1) \wedge z(r_2)\\
          z(r_1 \cdot r_2) &\dn z(r_1) \vee z(r_2)\\
          z(r^*)  &\dn false
        \end{align}\bigskip

        \textbf{atmostempty}: The property is ``either $L(r) = \emptyset$ or $L(r) = \{[]\}$'', which
        is more formally $a(r) \;\text{iff}\; L(r) \subseteq \{[]\}$:

        \begin{align}
          a(\ZERO) &\dn true\\
          a(\ONE)  &\dn true\\
          a(c)     &\dn false\\
          a(r_1 + r_2) &\dn a(r_1) \wedge a(r_2)\\
          a(r_1 \cdot r_2) &\dn z(r_1) \vee z(r_2) \vee (a(r_1) \wedge a(r_2))\\
          a(r^*)  &\dn a(r)
        \end{align}

        For this it is good to remember the regex should either not
        match anything at all, or just the empty string.\bigskip

        \textbf{somechars}: The property is ``$L(r)$ must contain a string which is not the empty string'', which
        is more formally $s(r) \;\text{iff}\; \exists\,s. s \not= [] \wedge s \in L(r)$:

        \begin{align}
          s(\ZERO) &\dn false\\
          s(\ONE)  &\dn false\\
          s(c)     &\dn true\\
          s(r_1 + r_2) &\dn s(r_1) \vee s(r_2)\\
          s(r_1 \cdot r_2) &\dn (\neg z(r_1) \wedge s(r_2)) \;\vee\; (\neg z(r_2) \wedge s(r_1))\\
          s(r^*)  &\dn s(r)
        \end{align}

        Here the interesting case is $r_1 \cdot r_2$ where one essentially has to make sure
        that one of the regexes is not zeroable, because then the resulting regex cannot match any
        string.\bigskip

        \textbf{infinitestrings}: The property is
        $i(r) \;\text{iff}\; L(r)\;\text{is infinite}$:

        \begin{align}
          i(\ZERO) &\dn false\\
          i(\ONE)  &\dn false\\
          i(c)     &\dn false\\
          i(r_1 + r_2) &\dn i(r_1) \vee i(r_2)\\
          i(r_1 \cdot r_2) &\dn (\neg z(r_1) \wedge i(r_2)) \;\vee\; (\neg z(r_2) \wedge i(r_1))\\
          i(r^*)  &\dn \neg a(r)
        \end{align}

        Here the interesting bit is that as soon $r$ can match at least a single string, then $r^*$
        will match infinitely many strings.
        }

      
\item There are two kinds of automata that are generated for 
  regular expression matching---DFAs and NFAs. (1) Regular expression engines like
  the one in Python generate NFAs.  Explain what is the problem with such
  NFAs and what is the reason why they use NFAs. (2) Regular expression
  engines like the one in Rust generate DFAs. Explain what is the
  problem with these regex engines and also what is the problem with $a^{\{1000\}}$
  in these engines.
      
%\item (Optional) The tokenizer in \texttt{regexp3.scala} takes as
%argument a string and a list of rules. The result is a list of tokens. Improve this tokenizer so 
%that it filters out all comments and whitespace from the result.

%\item (Optional) Modify the tokenizer in \texttt{regexp2.scala} so that it
%implements the \texttt{findAll} function. This function takes a regular
%expressions and a string, and returns all substrings in this string that 
      %match the regular expression.

\item \POSTSCRIPT  
\end{enumerate}


\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: