# HG changeset patch # User Christian Urban # Date 1380278984 -3600 # Node ID 95ee5cc5c05dfcf8db272111219009896df4dab6 # Parent 1933e88cb73e071fdfe87589e47907357a86e483 added diff -r 1933e88cb73e -r 95ee5cc5c05d handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r 1933e88cb73e -r 95ee5cc5c05d handouts/ho01.tex --- a/handouts/ho01.tex Fri Sep 27 11:01:31 2013 +0100 +++ b/handouts/ho01.tex Fri Sep 27 11:49:44 2013 +0100 @@ -4,9 +4,47 @@ \usepackage{amssymb} \usepackage{amsmath} \usepackage[T1]{fontenc} +\usepackage{listings} +\usepackage{xcolor} \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% +\definecolor{javared}{rgb}{0.6,0,0} % for strings +\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments +\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords +\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc + +\lstdefinelanguage{scala}{ + morekeywords={abstract,case,catch,class,def,% + do,else,extends,false,final,finally,% + for,if,implicit,import,match,mixin,% + new,null,object,override,package,% + private,protected,requires,return,sealed,% + super,this,throw,trait,true,try,% + type,val,var,while,with,yield}, + otherkeywords={=>,<-,<\%,<:,>:,\#,@}, + sensitive=true, + morecomment=[l]{//}, + morecomment=[n]{/*}{*/}, + morestring=[b]", + morestring=[b]', + morestring=[b]""" +} + +\lstset{language=Scala, + basicstyle=\ttfamily, + keywordstyle=\color{javapurple}\bfseries, + stringstyle=\color{javagreen}, + commentstyle=\color{javagreen}, + morecomment=[s][\color{javadocblue}]{/**}{*/}, + numbers=left, + numberstyle=\tiny\color{black}, + stepnumber=1, + numbersep=10pt, + tabsize=2, + showspaces=false, + showstringspaces=false} + \begin{document} \section*{Handout 1} @@ -238,15 +276,40 @@ \noindent This means we can now precisely state what the meaning, for example, of the regular expression ${\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}$ is, namely -$L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = 
\{\text{\it"hello"}\}$. Similarly if we have the choice -$a + b$, the meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly -be matched by this choice. +$L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = \{\text{\it"hello"}\}$...as expected. Similarly if we have the +choice-regular-expression $a + b$, its meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly +be matched by this choice. You can now also conclude why we do not make a difference +between the different regular expressions $(r_1 + r_2) + r_3$ and $r_1 + (r_2 + r_3)$....they +are not the same regular expression, but have the same meaning. -The point of this definition is that we can now precisely specify when a string is matched by a -regular expression, namely a string, say $s$, is matched by a regular expression, say $r$, if -and only if $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and +The point of the definition of $L$ is that we can now precisely specify when a string $s$ is matched by a +regular expression $r$, namely only when $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and any regular expression $r$ as argument and returns \emph{yes}, if $s \in L(r)$ and \emph{no}, if $s \not\in L(r)$. We leave this for the next lecture. + +\begin{figure}[p] +{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler1.scala}}} +\caption{Scala code for a web-crawler that can detect broken links in a web-page. It uses +the regular expression {\tt http\_pattern} in Line~15 for recognising URL-addresses. 
It finds +all links using the library function {\tt findAllIn} in Line~21.} +\end{figure} + +\begin{figure}[p] +{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler2.scala}}} +\caption{A version of the web-crawler which only follows links in ``my'' domain---since these are the +ones I am interested in fixing. It uses the regular expression {\tt my\_urls} in Line~16. +The main change is in Line~26 where we test whether the URL is in our domain or not.} + +\end{figure} + +\begin{figure}[p] +{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler3.scala}}} +\caption{A small email harvester---whenever we download a web-page, we also check whether +it contains any email addresses. For this we use the regular expression {\tt email\_pattern} in +Line~17. The main change is in Lines 33 and 34 where we print all email addresses +we can find in a page.} +\end{figure} + \end{document} %%% Local Variables: diff -r 1933e88cb73e -r 95ee5cc5c05d progs/crawler1.scala --- a/progs/crawler1.scala Fri Sep 27 11:01:31 2013 +0100 +++ b/progs/crawler1.scala Fri Sep 27 11:49:44 2013 +0100 @@ -5,7 +5,7 @@ import scala.util.matching.Regex import scala.util._ -// gets the first ~10K of a web-page +// gets the first 10K of a web-page def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } @@ -18,7 +18,7 @@ def unquote(s: String) = s.drop(1).dropRight(1) def get_all_URLs(page: String) : Set[String] = { - (http_pattern.findAllIn(page)).map { unquote(_) }.toSet + http_pattern.findAllIn(page).map(unquote).toSet } // naive version - seraches until a given depth @@ -35,6 +35,5 @@ val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" //val startURL = """http://www.inf.kcl.ac.uk/staff/mml/""" - crawl(startURL, 2) diff -r 1933e88cb73e -r 95ee5cc5c05d progs/crawler2.scala --- a/progs/crawler2.scala Fri Sep 27 11:01:31 2013 +0100 +++ b/progs/crawler2.scala Fri Sep 27 11:49:44 2013 +0100 @@ -5,16 +5,13 
@@ import scala.util.matching.Regex import scala.util._ -// gets the first ~10K of a web-page +// gets the first 10K of a web-page def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } -// staring URL for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" - -// regex for URLs +// regexes for URLs and "my" domain val http_pattern = """\"https?://[^\"]*\"""".r val my_urls = """urbanc""".r @@ -24,8 +21,6 @@ http_pattern.findAllIn(page).map(unquote).toSet } -// naive version - seraches until a given depth -// visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () else if (my_urls.findFirstIn(url) == None) () @@ -35,8 +30,10 @@ } } -// can now deal with depth 3 -// start on command line +// starting URL for the crawler +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" + +// can now deal with depth 3 and beyond crawl(startURL, 4) -crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2) + diff -r 1933e88cb73e -r 95ee5cc5c05d progs/crawler3.scala --- a/progs/crawler3.scala Fri Sep 27 11:01:31 2013 +0100 +++ b/progs/crawler3.scala Fri Sep 27 11:49:44 2013 +0100 @@ -1,20 +1,16 @@ -// This version of the crawler also -// harvests emails from webpages +// This is a version of the crawler that also +// "harvests" email addresses from webpages import io.Source import scala.util.matching.Regex import scala.util._ -// gets the first ~10K of a web-page def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } -// staring URL for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" - -// regex for URLs +// regexes for URLs, for "my" domain and for email addresses val http_pattern = """\"https?://[^\"]*\"""".r val my_urls = """urbanc""".r val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r @@ -28,8 
+24,6 @@ http_pattern.findAllIn(page).map(unquote).toSet } -// naive version - seraches until a given depth -// visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () //else if (my_urls.findFirstIn(url) == None) () @@ -41,4 +35,7 @@ } } +// starting URL for the crawler +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" + crawl(startURL, 3)