hws/hw04.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Sat, 21 Oct 2023 09:09:09 +0100
changeset 943 5365ef60707e
parent 939 f85e784d3014
permissions -rw-r--r--
updated

\documentclass{article}
\usepackage{../style}
\usepackage{../graphics}

\begin{document}

\section*{Homework 4}

%%\HEADER

\begin{enumerate}

\item Given the regular expressions

\begin{center}
\begin{tabular}{ll}    
  1) & $(ab + a)\cdot (\ONE + b)$\\
  2) & $(aa + a)^*$\\
\end{tabular}
\end{center}

there are several values for how these regular expressions can
recognise the strings (for 1) $ab$ and (for 2) $aaa$. Give in each case
\emph{all} the values and indicate which one is the POSIX value.

\solution{
  1) There are only 2 values (writing $a$ for $Char(a)$ and so on)

  \begin{center}
  \begin{tabular}{l}
    $Sequ(Left(Sequ(a,b)),Left(Empty))$\\
    $Sequ(Right(a),Left(b))$\\ 
  \end{tabular}    
  \end{center}
  
  The first is the POSIX value because of the preference for $Left$.

  2) There are three ``main'' values, namely

  \begin{center}
  \begin{tabular}{l}
    $Stars\,[Left(Sequ(a,a)),Right(a)]$\\
    $Stars\,[Right(a), Left(Sequ(a,a))]$\\
    $Stars\,[Right(a), Right(a), Right(a)]$\\
  \end{tabular}    
  \end{center}

  Again the first one is the POSIX value, but if it just about all
  possible values, then there are in fact infinitely many values because
  the following 

  \begin{center}
  \begin{tabular}{l}
    $Stars\,[Left(Sequ(a,a)),Empty,Right(a)]$\\
    $Stars\,[Left(Sequ(a,a)),Empty,Empty,Right(a)]$\\
    $Stars\,[Left(Sequ(a,a)),Empty,Right(a), Empty]$, \ldots\\
  \end{tabular}    
  \end{center}

  are also values for this regex and the string $aaa$.
}  

\item If a regular expression $r$ does not contain any occurrence of $\ZERO$,  
  is it possible for $L(r)$ to be empty? Explain why, or give a proof.

  \solution{
    No. The property to prove by induction is

    \begin{center}
    $P(r)$: If $r$ does not contain $\ZERO$, then $L(r) \not= \emptyset$.  
  \end{center}

  For this you have to now go through all cases.

  Case $r = 0$: $P(\ZERO)$ says: If $\ZERO$ does not contain $\ZERO$
  then \ldots. The premise is obviously false, so everything follows,
  in particular $L(r) \not= \emptyset$.\medskip

  Case $r = \ONE$ and $r = c$ are similar, just that the premise is
  true, but also $L(\ONE)$ and $L(c)$ are not empty. So we shown
  $L(r) \not= \emptyset$.\medskip

  Case $r = r_1 + r_2$: We know $P(r_1)$ and $P(r_2)$ as IHs. We need to show
  $P(r_1 + r_2)$: If $r_1 + r_2$ does not contain $\ZERO$, then $L(r_1 + r_2) \not= \emptyset$.

  If $r_1 + r_2$ does not contain $\ZERO$, then also $r_1$ does not contain $\ZERO$
  and $r_2$ does not contain $\ZERO$. So we can apply the two IHs $P(r_1)$ and $P(r_2)$,
  which allow us to infer that $L(r_1) \not= \emptyset$ and $L(r_2) \not= \emptyset$.
  But if this is the case, then also $L(r_1 + r_2) \not= \emptyset$, which is what we needed
  to show in this case.\medskip

  The other cases are similar.\bigskip


  This lemma essentially says that for basic regular expressions, if
  they do not match anything at all, they must contain $\ZERO$(s)
  somewhere.

}

\item Define the tokens and regular expressions for a language
  consisting of numbers, left-parenthesis $($, right-parenthesis $)$,
  identifiers and the operations $+$, $-$ and $*$. Can the following
  strings in this language be lexed?

  \begin{itemize}
  \item $(a + 3) * b$
  \item $)()++ -33$
  \item $(a / 3) * 3$
  \end{itemize}

  In case they can, can you give the corresponding token
  sequences.

  \solution{
  The first 2 are lexibile. The 3 one contains $/$ which is not an operator.
  }  

\item Assume $r$ is nullable. Show that
  \[ 1 + r + r\cdot r \;\equiv\; r\cdot r
  \]

  holds.

  \solution{
    If $r$ is nullable, then $1 + r \equiv r$. With this you can replace

    \begin{align}
      (1 + r) + r\cdot r & \equiv  r + r\cdot r\\
                         & \equiv  r \cdot (1 + r)\\
                         & \equiv  r \cdot r
    \end{align}  

    where in (2) you pull out the ``factor'' $r$ (because $r_1 \cdot (r_2 + r_3) \equiv r_1 \cdot r_2 + r_1 \cdot r_3$).
  }
  

%\item \textbf{(Deleted)} Assume that $s^{-1}$ stands for the operation of reversing a
%  string $s$. Given the following \emph{reversing} function on regular
%  expressions
%
%  \begin{center}
%    \begin{tabular}{r@{\hspace{1mm}}c@{\hspace{1mm}}l}
%      $rev(\ZERO)$   & $\dn$ & $\ZERO$\\
%      $rev(\ONE)$         & $\dn$ & $\ONE$\\
%      $rev(c)$                      & $\dn$ & $c$\\
%      $rev(r_1 + r_2)$        & $\dn$ & $rev(r_1) + rev(r_2)$\\
%      $rev(r_1 \cdot r_2)$  & $\dn$ & $rev(r_2) \cdot rev(r_1)$\\
%      $rev(r^*)$                   & $\dn$ & $rev(r)^*$\\
%    \end{tabular}
%  \end{center}
%
%  and the set
%
%  \begin{center}
%    $Rev\,A \dn \{s^{-1} \;|\; s \in A\}$
%  \end{center}
%
%  prove whether
%
%  \begin{center}
%    $L(rev(r)) = Rev (L(r))$
%  \end{center}
%
%  holds.

\item Construct a regular expression that can validate passwords. A
  password should be at least 8 characters long and consist of upper-
  and lower-case letters and digits. It should contain at least a
  single lower-case letter, at least a single upper-case letter and at
  least a single digit. If possible use the intersection regular
  expression from CW1, written $\_\&\_$, and the bounded regular
  expressions; you can also assume a regular expression written
  \texttt{ALL} that can match any character.

  \solution{
    You can build-up the different constraints separately and then
    use the intersection operator:

  \begin{center}  
  \begin{tabular}{lll}  
    $ALL^{\{8..\}}$ & \;\&\; & $(ALL^*\cdot [a-z]\cdot ALL^*)$\\
                   & \;\&\; & $(ALL^*\cdot [A-Z]\cdot ALL^*)$\\
                   & \;\&\; & $(ALL^*\cdot [0-9]\cdot ALL^*)$\\
  \end{tabular}
  \end{center}

  $ALL$ could be represented as $\sim \ZERO$.
  }
  
\item Assume the delimiters for comments are
      \texttt{$\slash$*} and \texttt{*$\slash$}. Give a
      regular expression that can recognise comments of the
      form

  \begin{center}
    \texttt{$\slash$*~\ldots{}~*$\slash$} 
  \end{center}

      where the three dots stand for arbitrary characters, but
      not comment delimiters. (Hint: You can assume you are
      already given a regular expression written \texttt{ALL},
      that can recognise any character, and a regular
      expression \texttt{NOT} that recognises the complement
      of a regular expression.)

      \solution{
        \[/ * \sim (ALL^* * / ALL^*) * /\]

      The idea to make sure in between $/ *$ and $* /$ ar no strings that contain $* /$.  
      }
      
\item Simplify the regular expression

\[
(\ZERO \cdot (b \cdot c)) + 
((\ZERO \cdot c) + \ONE)
\]

      Does simplification always preserve the meaning of a
      regular expression?

      \solution{ Yes, simplification preserves the language. It
        simplifies to just $\ONE$. It should be remembered that the
        Brzozowski does not simplify under stars. This does not apply
        in this example, though.  }
     
\item The Sulzmann \& Lu algorithm contains the function
      $mkeps$ which answers how a regular expression can match
      the empty string. What is the answer of $mkeps$ for the
      regular expressions:    

  \[
  \begin{array}{l}
  (\ZERO \cdot (b \cdot c)) + 
  ((\ZERO \cdot c) + \ONE)\\
    (a + \ONE) \cdot (\ONE + \ONE)\\
    a^*
  \end{array}
\]

\solution{
  The values are
  \begin{center}
  \begin{tabular}{l}
    $Right(Right(Empty))$\\
    $Sequ(Right(\ONE),Left(\ONE))$\\
    $Stars\,[]$
  \end{tabular}  
  \end{center}

  The last one uses the rule that $mkeps$ for the star returns always $Star\,[]$.
  }

\item What is the purpose of the record regular expression in
  the Sulzmann \& Lu algorithm?

  \solution{
    It marks a part of a regular expression and can be used to extract the part of the
    string that is matched by this marked part of the regular expression.
  }

\item Recall the functions \textit{nullable} and
      \textit{zeroable}.  Define recursive functions
      \textit{atmostempty} (for regular expressions that match no
      string or only the empty string), \textit{somechars} (for
      regular expressions that match some non-empty string),
      \textit{infinitestrings} (for regular expressions that can match
      infinitely many strings).

      \solution{
        \textbf{zeroable}: The property is $z(r) \;\text{iff}\; L(r) = \emptyset$:

        \begin{align}
          z(\ZERO) &\dn true\\
          z(\ONE)  &\dn false\\
          z(c)     &\dn false\\
          z(r_1 + r_2) &\dn z(r_1) \wedge z(r_2)\\
          z(r_1 \cdot r_2) &\dn z(r_1) \vee z(r_2)\\
          z(r^*)  &\dn false
        \end{align}\bigskip

        \textbf{atmostempty}: The property is ``either $L(r) = \emptyset$ or $L(r) = \{[]\}$'', which
        is more formally $a(r) \;\text{iff}\; L(r) \subseteq \{[]\}$:

        \begin{align}
          a(\ZERO) &\dn true\\
          a(\ONE)  &\dn true\\
          a(c)     &\dn false\\
          a(r_1 + r_2) &\dn a(r_1) \wedge a(r_2)\\
          a(r_1 \cdot r_2) &\dn z(r_1) \vee z(r_2) \vee (a(r_1) \wedge a(r_2))\\
          a(r^*)  &\dn a(r)
        \end{align}

        For this it is good to remember the regex should either not
        match anything at all, or just the empty string.\bigskip

        \textbf{somechars}: The property is ``$L(r)$ must contain a string which is not the empty string'', which
        is more formally $s(r) \;\text{iff}\; \exists\,s. s \not= [] \wedge s \in L(r)$:

        \begin{align}
          s(\ZERO) &\dn false\\
          s(\ONE)  &\dn false\\
          s(c)     &\dn true\\
          s(r_1 + r_2) &\dn s(r_1) \vee s(r_2)\\
          s(r_1 \cdot r_2) &\dn (\neg z(r_1) \wedge s(r_2)) \;\vee\; (\neg z(r_2) \wedge s(r_1))\\
          s(r^*)  &\dn s(r)
        \end{align}

        Here the interesting case is $r_1 \cdot r_2$ where one essentially has to make sure
        that one of the regexes is not zeroable, because then the resulting regex cannot match any
        string.\bigskip

        \textbf{infinitestrings}: The property is
        $i(r) \;\text{iff}\; L(r)\;\text{is infinite}$:

        \begin{align}
          i(\ZERO) &\dn false\\
          i(\ONE)  &\dn false\\
          i(c)     &\dn false\\
          i(r_1 + r_2) &\dn i(r_1) \vee i(r_2)\\
          i(r_1 \cdot r_2) &\dn (\neg z(r_1) \wedge i(r_2)) \;\vee\; (\neg z(r_2) \wedge i(r_1))\\
          i(r^*)  &\dn \neg a(r)
        \end{align}

        Here the interesting bit is that as soon $r$ can match at least a single non-empty string, then $r^*$
        will match infinitely many strings.
        }

      
\item There are two kinds of automata that are generated for 
  regular expression matching---DFAs and NFAs. (1) Regular expression engines like
  the one in Python generate NFAs.  Explain what is the problem with such
  NFAs and what is the reason why they use NFAs. (2) Regular expression
  engines like the one in Rust generate DFAs. Explain what is the
  problem with these regex engines and also what is the problem with $a^{\{1000\}}$
  in these engines.

  \solution{
    Why they use NFAs? NFAs are of similar size as the regular expression (they do not explode
    for the basic regular expressions. Python regex library supports constructions like
    back-refernces which cannot be represented by DFAs (string matching with back-references
    can be NP. What is the problem with $a^{\{1000\}}$. When generating DFAs (and NFAs) for the
    bounded regular expressions, one has to make $n$ copies, which means their size can grow
    drastically for large counters.
  }
      
%\item (Optional) The tokenizer in \texttt{regexp3.scala} takes as
%argument a string and a list of rules. The result is a list of tokens. Improve this tokenizer so 
%that it filters out all comments and whitespace from the result.

%\item (Optional) Modify the tokenizer in \texttt{regexp2.scala} so that it
%implements the \texttt{findAll} function. This function takes a regular
%expressions and a string, and returns all substrings in this string that 
      %match the regular expression.

\item \POSTSCRIPT  
\end{enumerate}


\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: