hws/hw05.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Fri, 24 Oct 2025 08:55:14 +0100
changeset 1016 c02d409ed7f4
parent 964 da1f8c033b8e
permissions -rw-r--r--
added amm faq

\documentclass{article}
\usepackage{../style}
\usepackage{../graphics}
\usepackage{../grammar}

\begin{document}

% explain what is a context-free grammar and the language it generates 
%


\section*{Homework 5}

%%%\HEADER


\begin{enumerate}
\item Consider the basic regular expressions

\begin{center}
$r ::= \ZERO \;|\; \ONE \;|\; c  \;|\; r_1 + r_2  \;|\; r_1 \cdot r_2 \;|\; r^*$
\end{center}

      and suppose you want to show a property $P(r)$ for all
      regular expressions $r$ by structural induction. Write
      down which cases do you need to analyse. State clearly
      the induction hypotheses if applicable in a case.

\item Define a regular expression, written $ALL$, that can
      match every string. This definition should be in terms
      of the following extended regular expressions:

\begin{center}
$r ::= \ZERO \;|\; 
       \ONE \;|\;  
       c  \;|\; 
       r_1 + r_2 \;|\; 
       r_1 \cdot r_2 \;|\; 
       r^* \;|\;
       \sim r$
\end{center}

\solution{
  There is the obvious solution $\sim{}\ZERO$, but also $a + \sim{}a$ would work.
}  
     

%\item Assume the delimiters for comments are \texttt{$\slash$*}
%and \texttt{*$\slash$}. Give a regular expression that can
%recognise comments of the form
%
%\begin{center}
%\texttt{$\slash$*~\ldots{}~*$\slash$} 
%\end{center}
%
%where the three dots stand for arbitrary characters, but not
%comment delimiters.

\item The \emph{not}-regular expression is definitely useful for recognising
  comments for example, but can
  sometimes be quite unintuitive when it comes to deciding which strings are
  matched or not. Consider

  \[
  (\sim{}a)^*  \quad\text{and}\quad \sim{}(a^*)\;.  
  \]  

  What is the language of each regular expression?

  \solution{
    The first one is ``all strings, except $[a]$''; the second
    ``all strings except strings of the form $a^*$''.
  }
  
\item Define the following regular expressions 

\begin{center}
\begin{tabular}{ll}
$r^+$ & (one or more matches)\\
$r^?$   & (zero or one match)\\
$r^{\{n\}}$ & (exactly $n$ matches)\\
$r^{\{m.. n\}}$ & (at least $m$ and maximal $n$ matches, with the\\
&  \phantom{(}assumption $m \le n$)\\
\end{tabular}
\end{center}

in terms of the usual basic regular expressions

\begin{center}
$r ::= \ZERO \;|\; \ONE \;|\; c  \;|\; r_1 + r_2  \;|\; r_1 \cdot r_2 \;|\; r^*$
\end{center}

\solution{
  $r^+ \dn r\cdot r^*$\\
  $r^? \dn r + 1$\\
  $r^{\{0\}} = \ONE$\\
  $r^{\{n\}} \dn r\cdot r^{\{n-1\}}$\\
  $r^{\{..n\}} \dn (r^?)^{\{n\}}$\\
  $r^{\{n..m\}} \dn r^{\{..m-n\}}\cdot r^{\{n\}}$\\

  BTW, $r^{\{n..m\}}$ cannot be defined in terms of $r^{\{n..\}} \;\&\; r^{\{..m\}}$ where $\&$ is
  the intersection operator I introduced this year. For example assume $r=aaa + aaaaaaa$, then 
  $r^{\{4..6\}}$ cannot match 21 a's, but $r^{\{4..\}} \;\&\; r^{\{..6\}}$.
  }


\item Give the regular expressions for lexing a language
      consisting of identifiers, left-parenthesis \texttt{(},
      right-parenthesis \texttt{)}, numbers that can be either
      positive or negative, and the operations \texttt{+},
      \texttt{-} and \texttt{*}. 

      Decide whether the following strings can be lexed in
      this language?

\begin{enumerate}
\item \texttt{"(a3+3)*b"}
\item \texttt{")()++-33"}
\item \texttt{"(b42/3)*3"}
\end{enumerate}

In case they can, give the corresponding token sequences. (Hint: 
Observe the maximal munch rule and the priorities of your regular
expressions that make the process of lexing unambiguous.)

\solution{
  The first two strings can be lexed. But not the last ($/$ is not part of the language).
}

\item Suppose the following context-free grammar $G$
  
\begin{plstx}[margin=1cm]
  : \meta{S\/} ::= \meta{A\/}\cdot\meta{S\/}\cdot\meta{B\/} \;\mid\;
                   \meta{B\/}\cdot\meta{S\/}\cdot\meta{A\/} \;\mid\; \epsilon\\
  : \meta{A\/} ::= a \mid \epsilon\\
  : \meta{B\/} ::= b\\
\end{plstx}

where the starting symbol is $\meta{S}$.
Which of the following strings are in the language of $G$?

\begin{itemize}
\item[$\bullet$] $a$
\item[$\bullet$] $b$
\item[$\bullet$] $ab$
\item[$\bullet$] $ba$
\item[$\bullet$] $bb$
\item[$\bullet$] $baa$
\end{itemize}

\solution{
  The first and the last cannot be matched. Maybe it is a good exercise to
  write down the derivations for the rest.

  BTW, the language recognised by this grammar is strings consisting of
  a's and b's where there are equal or more number of b's than a's (including the
  empty string).
}

\item Suppose the following context-free grammar 
  
  \begin{plstx}[margin=1cm]
  : \meta{S\/} ::= a\cdot \meta{S\/}\cdot a\;\mid\;
                   b\cdot \meta{S\/}\cdot b\;\mid\; \epsilon\\
  \end{plstx}

Describe which language is generated by this grammar.

\solution{Palindromes with the same number of a's and b's, including
  the empty string}


\item Remember we have specified identifiers with regular expressions as
  strings that start with a letter followed by letters, digits and
  underscores. This can also be specified by a grammar rule or rules.
  What would the rule(s) look like for identifiers? 

  \solution{
  \begin{plstx}[margin=1cm]
  : \meta{Id\/} ::= \meta{Let\/}\cdot \meta{R}\\
  : \meta{Let\/} ::= a \;\mid\; \dots \;\mid\; z\\
  : \meta{Dig\/} ::= 0 \;\mid\; \dots \;\mid\; 9\\
  : \meta{R\/} ::= \meta{Let\/} \cdot \meta{R\/} \;\mid\;
                  \meta{Dig\/} \cdot \meta{R\/} \;\mid\;
                  $\_$ \cdot \meta{R\/} \;\mid\; \epsilon\\
  \end{plstx}  
}

\item If we specify keywords, identifiers (see above) and programs
  by grammar rules, are there any problems you need to be careful
  about when using a parser for identifying tokens?

  \solution{Parsers do not have the POSIX rules (e.g.~longest munch
    rule) built in. I am not aware that any parser does this out of
    the box and you would need to build in such constraints into the
    grammar rules or parsing mechanism.}

\item {\bf(Optional)} Recall the definitions for $Der$ and $der$
      from the lectures. Prove by induction on $r$ the
      property that 

\[
L(der\,c\,r) = Der\,c\,(L(r))
\]

      holds.

\item \POSTSCRIPT        
\end{enumerate}

\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: