2 \usepackage{charter} |
2 \usepackage{charter} |
3 \usepackage{hyperref} |
3 \usepackage{hyperref} |
4 \usepackage{amssymb} |
4 \usepackage{amssymb} |
5 \usepackage{amsmath} |
5 \usepackage{amsmath} |
6 \usepackage[T1]{fontenc} |
6 \usepackage[T1]{fontenc} |
|
7 \usepackage{listings} |
|
8 \usepackage{xcolor} |
7 |
9 |
8 \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
10 \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
9 |
11 |
|
12 \definecolor{javared}{rgb}{0.6,0,0} % for strings |
|
13 \definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments |
|
14 \definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords |
|
15 \definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc |
|
16 |
|
17 \lstdefinelanguage{scala}{ |
|
18 morekeywords={abstract,case,catch,class,def,% |
|
19 do,else,extends,false,final,finally,% |
|
20 for,if,implicit,import,match,mixin,% |
|
21 new,null,object,override,package,% |
|
22 private,protected,requires,return,sealed,% |
|
23 super,this,throw,trait,true,try,% |
|
24 type,val,var,while,with,yield}, |
|
25 otherkeywords={=>,<-,<\%,<:,>:,\#,@}, |
|
26 sensitive=true, |
|
27 morecomment=[l]{//}, |
|
28 morecomment=[n]{/*}{*/}, |
|
29 morestring=[b]", |
|
30 morestring=[b]', |
|
31 morestring=[b]""" |
|
32 } |
|
33 |
|
34 \lstset{language=Scala, |
|
35 basicstyle=\ttfamily, |
|
36 keywordstyle=\color{javapurple}\bfseries, |
|
37 stringstyle=\color{javagreen}, |
|
38 commentstyle=\color{javagreen}, |
|
39 morecomment=[s][\color{javadocblue}]{/**}{*/}, |
|
40 numbers=left, |
|
41 numberstyle=\tiny\color{black}, |
|
42 stepnumber=1, |
|
43 numbersep=10pt, |
|
44 tabsize=2, |
|
45 showspaces=false, |
|
46 showstringspaces=false} |
|
47 |
10 \begin{document} |
48 \begin{document} |
11 |
49 |
12 \section*{Handout 1} |
50 \section*{Handout 1} |
13 |
51 |
14 This course is about the processing of strings. Lets start with what we mean by \emph{strings}. Strings |
52 This course is about the processing of strings. Lets start with what we mean by \emph{strings}. Strings |
236 \end{center} |
274 \end{center} |
237 |
275 |
238 \noindent |
276 \noindent |
239 This means we can now precisely state what the meaning, for example, of the regular expression |
277 This means we can now precisely state what the meaning, for example, of the regular expression |
240 ${\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}$ is, namely |
278 ${\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}$ is, namely |
241 $L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = \{\text{\it"hello"}\}$. Similarly if we have the choice |
279 $L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = \{\text{\it"hello"}\}$...as expected. Similarly if we have the |
242 $a + b$, the meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly |
280 choice-regular-expression $a + b$, its meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly |
243 be matched by this choice. |
281 be matched by this choice. You can now also conclude why we do not make a difference |
244 |
282 between the different regular expressions $(r_1 + r_2) + r_3$ and $r_1 + (r_2 + r_3)$....they |
245 The point of this definition is that we can now precisely specify when a string is matched by a |
283 are not the same regular expression, but have the same meaning. |
246 regular expression, namely a string, say $s$, is matched by a regular expression, say $r$, if |
284 |
247 and only if $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and |
285 The point of the definition of $L$ is that we can now precisely specify when a string $s$ is matched by a |
|
286 regular expression $r$, namely only when $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and |
248 any regular expression $r$ as argument and returns \emph{yes}, if $s \in L(r)$ and \emph{no}, |
287 any regular expression $r$ as argument and returns \emph{yes}, if $s \in L(r)$ and \emph{no}, |
249 if $s \not\in L(r)$. We leave this for the next lecture. |
288 if $s \not\in L(r)$. We leave this for the next lecture. |
|
289 |
|
290 \begin{figure}[p] |
|
291 {\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler1.scala}}} |
|
292 \caption{Scala code for a web-crawler that can detect broken links in a web-page. It uses |
|
293 the regular expression {\tt http\_pattern} in Line~15 for recognising URL-addresses. It finds |
|
294 all links using the library function {\tt findAllIn} in Line~21.} |
|
295 \end{figure} |
|
296 |
|
297 \begin{figure}[p] |
|
298 {\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler2.scala}}} |
|
299 \caption{A version of the web-crawler which only follows links in ``my'' domain---since these are the |
|
300 ones I am interested in to fix. It uses the regular expression {\tt my\_urls} in Line~16. |
|
301 The main change is in Line~26 where we test whether URL is in our domain or not.} |
|
302 |
|
303 \end{figure} |
|
304 |
|
305 \begin{figure}[p] |
|
306 {\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler3.scala}}} |
|
307 \caption{A small email harvester---whenever we download a web-page, we also check whether |
|
308 it contains any email addresses. For this we use the regular expression {\tt email\_pattern} in |
|
309 Line~17. The main change is in Lines 33 and 34 where we print all email addresses |
|
310 we can find in a page.} |
|
311 \end{figure} |
|
312 |
250 \end{document} |
313 \end{document} |
251 |
314 |
252 %%% Local Variables: |
315 %%% Local Variables: |
253 %%% mode: latex |
316 %%% mode: latex |
254 %%% TeX-master: t |
317 %%% TeX-master: t |