# HG changeset patch # User Christian Urban # Date 1380897610 -3600 # Node ID a75f9c9d8f940adef215f8f280f4244e7db6ce2a # Parent 4123344e6915957433cba2229ac5e59286466248 added diff -r 4123344e6915 -r a75f9c9d8f94 handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r 4123344e6915 -r a75f9c9d8f94 handouts/ho01.tex --- a/handouts/ho01.tex Wed Oct 02 01:16:30 2013 +0100 +++ b/handouts/ho01.tex Fri Oct 04 15:40:10 2013 +0100 @@ -108,13 +108,13 @@ \] \noindent -The reason is that for example before the $@$-sign there can be any string you want assuming it +One reason is that before the $@$-sign there can be any string you want assuming it is made up from letters, digits, underscores, dots and hyphens---clearly there are infinitely many of those. Similarly the string after the $@$-sign can be any string. However, this does not mean that every string is an email address. For example \[ -\text{\it foo}@\text{\it bar}.\text{\it c} +"\text{\it foo}@\text{\it bar}.\text{\it c}" \] \noindent @@ -138,7 +138,7 @@ like to think about what this definition means in case $A$ or $B$ is the empty set. We also need to define -the power of a set, written as $A^n$ with $n$ being a natural number. This is defined inductively as follows +the power of a set of strings, written as $A^n$ with $n$ being a natural number. This is defined inductively as follows \begin{center} \begin{tabular}{rcl} @@ -168,8 +168,8 @@ Be aware that these operations sometimes have quite non-intuitive properties, for example \begin{center} -\begin{tabular}{ccc} -\begin{tabular}{r@{\hspace{1mm}}c@{\hspace{1mm}}l} +\begin{tabular}{@{}ccc@{}} +\begin{tabular}{@{}r@{\hspace{1mm}}c@{\hspace{1mm}}l} $A \cup \varnothing$ & $=$ & $A$\\ $A \cup A$ & $=$ & $A$\\ $A \cup B$ & $=$ & $B \cup A$\\ @@ -179,7 +179,7 @@ $A @ \varnothing$ & $=$ & $\varnothing @ A = \varnothing$\\ $A @ \{""\}$ & $=$ & $\{""\} @ A = A$\\ \end{tabular} & -\begin{tabular}{r@{\hspace{1mm}}c@{\hspace{1mm}}l} +\begin{tabular}{r@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} $\varnothing^*$ & $=$ & $\{""\}$\\ $\{""\}^*$ & $=$ & $\{""\}$\\ $A^\star$ & $=$ & $\{""\} \cup A\cdot A^*$\\ @@ -208,14 +208,15 @@ \end{center} \noindent -Because we overload our notation there are some subtleties you should be aware of. The letter $c$ stands for any character from the +Because we overload our notation, there are some subtleties you should be aware of. First, the letter +$c$ stands for any character from the alphabet at hand. Second, we will use parentheses to disambiguate regular expressions. For example we will write $(r_1 + r_2)^*$, which is different from, say $r_1 + (r_2)^*$. The former means roughly zero or more times $r_1$ or $r_2$, while the latter means $r_1$ or zero or more times $r_2$. We should also write $(r_1 + r_2) + r_3$, which is different from the regular expression $r_1 + (r_2 + r_3)$, but in case of $+$ and $\cdot$ we actually do not care about the order and just write $r_1 + r_2 + r_3$, or $r_1 \cdot r_2 \cdot r_3$, respectively. The reasons for this will become clear shortly. In the literature you will often find that the choice -$r_1 + r_2$ is written as $r_1\mid{}r_2$. Also following the convention in the literature, we will in case of $\cdot$ even +$r_1 + r_2$ is written as $r_1\mid{}r_2$ or $r_1\mid\mid{}r_2$. Also following the convention in the literature, we will in case of $\cdot$ even often omit it all together. For example the regular expression for email addresses shown above is meant to be of the form \[ @@ -224,7 +225,7 @@ \noindent meaning first comes a name (specified by the regular expression $([\ldots])^+$), then an $@$-sign, then -a domain name (specified by the regular expression $([\ldots])^+$), then a top-level domain. Similarly if +a domain name (specified by the regular expression $([\ldots])^+$), then a dot and then a top-level domain. Similarly if we want to specify the regular expression for the string {\it "hello"} we should write \[ @@ -234,8 +235,8 @@ \noindent but often just write {\it hello}. -Another source of confusion might arise from the fact that we use the term \emph{regular expressions} for the ones used in ``theory'' -and also the ones in ``practice''. In this course we refer by default to the regular expressions defined by the grammar above. +Another source of confusion might arise from the fact that we use the term \emph{regular expression} for the regular expressions used in ``theory'' +and also the ones used in ``practice''. In this course we refer by default to the regular expressions defined by the grammar above. In ``practice'' we often use $r^+$ to stand for one or more times, $\backslash{}d$ to stand for a digit, $r^?$ to stand for an optional regular expression, or ranges such as $[\text{\it a - z}]$ to stand for any lower case letter from $a$ to $z$. They are however mere convenience as they can be seen as shorthand for @@ -254,12 +255,12 @@ expression is supposed to stand for every string \emph{not} matched by a regular expression. We will write such not-regular-expressions as $\sim{}r$. While being ``convenience'' it is often not so clear what the shorthand for these kind of not-regular-expressions is. Try to write down the regular expression which can match any -string except {\it "hello"} and {\it "world"}. It is possible in principle, but often it is easier to just include -$\sim{}r$ in the definition or regular expressions. Whenever we do so, we will state it explicitly. +string except the two strings {\it "hello"} and {\it "world"}. It is possible in principle, but often it is easier to just include +$\sim{}r$ in the definition of regular expressions. Whenever we do so, we will state it explicitly. So far we have only considered informally what the \emph{meaning} of a regular expression is. To do so more formally we will associate with every regular expression a set of strings -that is supposed to be are matched by this +that is supposed to be matched by this regular expression. This can be defined recursively as follows \begin{center} @@ -274,15 +275,15 @@ \end{center} \noindent -This means we can now precisely state what the meaning, for example, of the regular expression +As a result we can now precisely state what the meaning, for example, of the regular expression ${\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}$ is, namely $L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = \{\text{\it"hello"}\}$...as expected. Similarly if we have the choice-regular-expression $a + b$, its meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly -be matched by this choice. You can now also conclude why we do not make a difference +be matched by this choice. You can now also see why we do not make a difference between the different regular expressions $(r_1 + r_2) + r_3$ and $r_1 + (r_2 + r_3)$....they are not the same regular expression, but have the same meaning. -The point of the definition of $L$ is that we can now precisely specify when a string $s$ is matched by a +The point of the definition of $L$ is that we can use it to precisely specify when a string $s$ is matched by a regular expression $r$, namely only when $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and any regular expression $r$ as argument and returns \emph{yes}, if $s \in L(r)$ and \emph{no}, if $s \not\in L(r)$. We leave this for the next lecture. diff -r 4123344e6915 -r a75f9c9d8f94 handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r 4123344e6915 -r a75f9c9d8f94 handouts/ho02.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/handouts/ho02.tex Fri Oct 04 15:40:10 2013 +0100 @@ -0,0 +1,74 @@ +\documentclass{article} +\usepackage{charter} +\usepackage{hyperref} +\usepackage{amssymb} +\usepackage{amsmath} +\usepackage[T1]{fontenc} +\usepackage{listings} +\usepackage{xcolor} + +\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% + +\definecolor{javared}{rgb}{0.6,0,0} % for strings +\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments +\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords +\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc + +\lstdefinelanguage{scala}{ + morekeywords={abstract,case,catch,class,def,% + do,else,extends,false,final,finally,% + for,if,implicit,import,match,mixin,% + new,null,object,override,package,% + private,protected,requires,return,sealed,% + super,this,throw,trait,true,try,% + type,val,var,while,with,yield}, + otherkeywords={=>,<-,<\%,<:,>:,\#,@}, + sensitive=true, + morecomment=[l]{//}, + morecomment=[n]{/*}{*/}, + morestring=[b]", + morestring=[b]', + morestring=[b]""" +} + +\lstset{language=Scala, + basicstyle=\ttfamily, + keywordstyle=\color{javapurple}\bfseries, + stringstyle=\color{javagreen}, + commentstyle=\color{javagreen}, + morecomment=[s][\color{javadocblue}]{/**}{*/}, + numbers=left, + numberstyle=\tiny\color{black}, + stepnumber=1, + numbersep=10pt, + tabsize=2, + showspaces=false, + showstringspaces=false} + +\begin{document} + +\section*{Handout 2} + +Having specified what problem our matching algorithm, $match$, is supposed to solve, namely +for a given regular expression $r$ and string $s$ answer $true$ if and only if + +\[ +s \in L(r) +\] + +\noindent +Clearly we cannot use the function $L$ directly in order to solve this problem, because in general +the set of strings $L$ returns is infinite (recall what $L(a^*)$ is). In such cases there is no algorithm +then can test exhaustively, whether a string is member of this set. + +The algorithm we define below consists of two parts. One is the function $nullable$ which takes a +regular expression as argument and decides whether it can match the empty string. This can be easily +defined recursively as follows: + + +\end{document} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: