author Christian Urban <christian dot urban at kcl dot ac dot uk>
Fri, 27 Sep 2013 11:49:44 +0100
changeset 112 95ee5cc5c05d
parent 111 1933e88cb73e
child 113 db6862f6bf6c
added
handouts/ho01.pdf
handouts/ho01.tex
progs/crawler1.scala
progs/crawler2.scala
progs/crawler3.scala
Binary file handouts/ho01.pdf has changed
--- a/handouts/ho01.tex	Fri Sep 27 11:01:31 2013 +0100
+++ b/handouts/ho01.tex	Fri Sep 27 11:49:44 2013 +0100
@@ -4,9 +4,47 @@
 \usepackage{amssymb}
 \usepackage{amsmath}
 \usepackage[T1]{fontenc}
+\usepackage{listings}
+\usepackage{xcolor}
 
 \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
 
+\definecolor{javared}{rgb}{0.6,0,0} % for strings
+\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments
+\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords
+\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc
+
+\lstdefinelanguage{scala}{
+  morekeywords={abstract,case,catch,class,def,%
+    do,else,extends,false,final,finally,%
+    for,if,implicit,import,match,mixin,%
+    new,null,object,override,package,%
+    private,protected,requires,return,sealed,%
+    super,this,throw,trait,true,try,%
+    type,val,var,while,with,yield},
+  otherkeywords={=>,<-,<\%,<:,>:,\#,@},
+  sensitive=true,
+  morecomment=[l]{//},
+  morecomment=[n]{/*}{*/},
+  morestring=[b]",
+  morestring=[b]',
+  morestring=[b]"""
+}
+
+\lstset{language=Scala,
+	basicstyle=\ttfamily,
+	keywordstyle=\color{javapurple}\bfseries,
+	stringstyle=\color{javagreen},
+	commentstyle=\color{javagreen},
+	morecomment=[s][\color{javadocblue}]{/**}{*/},
+	numbers=left,
+	numberstyle=\tiny\color{black},
+	stepnumber=1,
+	numbersep=10pt,
+	tabsize=2,
+	showspaces=false,
+	showstringspaces=false}
+
 \begin{document}
 
 \section*{Handout 1}
@@ -238,15 +276,40 @@
 \noindent
 This means we can now precisely state what the meaning, for example, of the regular expression 
 ${\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}$ is, namely 
-$L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = \{\text{\it"hello"}\}$. Similarly if we have the choice
-$a + b$, the meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly
-be matched by this choice. 
+$L({\it h} \cdot {\it e} \cdot {\it l} \cdot {\it l} \cdot {\it o}) = \{\text{\it"hello"}\}$, as expected. Similarly, if we have the 
+choice-regular-expression $a + b$, its meaning is $L(a + b) = \{\text{\it"a"}, \text{\it"b"}\}$, namely the only two strings which can possibly
+be matched by this choice. You can now also see why we do not distinguish
+between the regular expressions $(r_1 + r_2) + r_3$ and $r_1 + (r_2 + r_3)$: they 
+are not the same regular expression, but they have the same meaning. 
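+For example, using the clause $L(r_1 + r_2) = L(r_1) \cup L(r_2)$ from the
+definition of $L$ twice, both bracketings denote the set
+$L(r_1) \cup L(r_2) \cup L(r_3)$, since $\cup$ is associative.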
 
-The point of this definition is that we can now precisely specify when a string is matched by a
-regular expression, namely a string, say $s$, is matched by a regular expression, say $r$, if
-and only if $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and
+The point of the definition of $L$ is that we can now precisely specify when a string $s$ is matched by a
+regular expression $r$, namely exactly when $s \in L(r)$. In fact we will write a program {\it match} that takes any string $s$ and
 any regular expression $r$ as argument and returns \emph{yes}, if $s \in L(r)$ and \emph{no},
 if $s \not\in L(r)$. We leave this for the next lecture.
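+
+\noindent
+As a small taster of what is to come, regular expressions can be represented
+in Scala as a simple datatype (only a sketch; the constructor names below are
+made up for illustration and are not fixed by the lecture):
+
+\begin{lstlisting}
+abstract class Rexp
+case object NULL extends Rexp                    // matches no string
+case object EMPTY extends Rexp                   // matches the empty string
+case class CHAR(c: Char) extends Rexp            // matches the character c
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // choice:   r1 + r2
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence: r1 . r2
+case class STAR(r: Rexp) extends Rexp            // star:     r*
+\end{lstlisting}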
+
+\begin{figure}[p]
+{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler1.scala}}}
+\caption{Scala code for a web-crawler that can detect broken links in a web-page. It uses
+the regular expression {\tt http\_pattern} in Line~15 for recognising URLs. It finds
+all links using the library function {\tt findAllIn} in Line~21.}
+\end{figure}
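+
+\noindent
+To get a feel for what {\tt findAllIn} returns, here is a sketch of a call
+on a made-up snippet of HTML (the address is hypothetical):
+
+\begin{lstlisting}
+val http_pattern = """\"https?://[^\"]*\"""".r
+val snippet = """<a href="http://www.example.com/">example</a>"""
+http_pattern.findAllIn(snippet).toList
+// gives one match, the URL still enclosed in its double quotes --
+// this is why get_all_URLs applies unquote to every match
+\end{lstlisting}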
+
+\begin{figure}[p]
+{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler2.scala}}}
+\caption{A version of the web-crawler which only follows links in ``my'' domain---since these are the
+ones I am interested in fixing. It uses the regular expression {\tt my\_urls} in Line~16.
+The main change is in Line~26 where we test whether the URL is in our domain or not.}
+\end{figure}
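+
+\noindent
+The domain test boils down to the following checks (a sketch; the second
+address is made up):
+
+\begin{lstlisting}
+val my_urls = """urbanc""".r
+my_urls.findFirstIn("http://www.inf.kcl.ac.uk/staff/urbanc/") // Some(urbanc)
+my_urls.findFirstIn("http://www.example.com/")                // None
+\end{lstlisting}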
+
+\begin{figure}[p]
+{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/crawler3.scala}}}
+\caption{A small email harvester---whenever we download a web-page, we also check whether
+it contains any email addresses. For this we use the regular expression {\tt email\_pattern} in
+Line~17.  The main change is in Lines~33 and~34 where we print all email addresses
+we can find in a page.}
+\end{figure}
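+
+\noindent
+Applied to a made-up address, the email regular expression picks it out of
+the surrounding text (a sketch only):
+
+\begin{lstlisting}
+val email_pattern =
+  """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
+email_pattern.findFirstIn("contact: jane.doe@example.co.uk")
+// gives Some(jane.doe@example.co.uk)
+\end{lstlisting}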
+
 \end{document}
 
 %%% Local Variables: 
--- a/progs/crawler1.scala	Fri Sep 27 11:01:31 2013 +0100
+++ b/progs/crawler1.scala	Fri Sep 27 11:49:44 2013 +0100
@@ -5,7 +5,7 @@
 import scala.util.matching.Regex
 import scala.util._
 
-// gets the first ~10K of a web-page
+// gets the first 10K of a web-page
 def get_page(url: String) : String = {
   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     { println(s"  Problem with: $url"); ""}
@@ -18,7 +18,7 @@
 def unquote(s: String) = s.drop(1).dropRight(1)
 
 def get_all_URLs(page: String) : Set[String] = {
-  (http_pattern.findAllIn(page)).map { unquote(_) }.toSet
+  http_pattern.findAllIn(page).map(unquote).toSet
 }
 
 // naive version - searches until a given depth
@@ -35,6 +35,5 @@
 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
 //val startURL = """http://www.inf.kcl.ac.uk/staff/mml/"""
 
-
 crawl(startURL, 2)
 
--- a/progs/crawler2.scala	Fri Sep 27 11:01:31 2013 +0100
+++ b/progs/crawler2.scala	Fri Sep 27 11:49:44 2013 +0100
@@ -5,16 +5,13 @@
 import scala.util.matching.Regex
 import scala.util._
 
-// gets the first ~10K of a web-page
+// gets the first 10K of a web-page
 def get_page(url: String) : String = {
   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     { println(s"  Problem with: $url"); ""}
 }
 
-// staring URL for the crawler
-val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
-
-// regex for URLs
+// regexes for URLs and "my" domain
 val http_pattern = """\"https?://[^\"]*\"""".r
 val my_urls = """urbanc""".r
 
@@ -24,8 +21,6 @@
   http_pattern.findAllIn(page).map(unquote).toSet
 }
 
-// naive version - seraches until a given depth
-// visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {
   if (n == 0) ()
   else if (my_urls.findFirstIn(url) == None) ()
@@ -35,8 +30,10 @@
   }
 }
 
-// can now deal with depth 3
-// start on command line
+// starting URL for the crawler
+val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
+
+// can now deal with depth 3 and beyond
 crawl(startURL, 4)
 
-crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2)
+
--- a/progs/crawler3.scala	Fri Sep 27 11:01:31 2013 +0100
+++ b/progs/crawler3.scala	Fri Sep 27 11:49:44 2013 +0100
@@ -1,20 +1,16 @@
-// This version of the crawler also
-// harvests emails from webpages
+// This version of the crawler also
+// "harvests" email addresses from webpages
 
 import io.Source
 import scala.util.matching.Regex
 import scala.util._
 
-// gets the first ~10K of a web-page
 def get_page(url: String) : String = {
   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     { println(s"  Problem with: $url"); ""}
 }
 
-// staring URL for the crawler
-val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
-
-// regex for URLs
+// regexes for URLs, for "my" domain and for email addresses
 val http_pattern = """\"https?://[^\"]*\"""".r
 val my_urls = """urbanc""".r
 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
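+// (would match, e.g., "jane.doe@example.co.uk" -- a made-up address)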
@@ -28,8 +24,6 @@
   http_pattern.findAllIn(page).map(unquote).toSet
 }
 
-// naive version - seraches until a given depth
-// visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {
   if (n == 0) ()
   //else if (my_urls.findFirstIn(url) == None) ()
@@ -41,4 +35,7 @@
   }
 }
 
+// starting URL for the crawler
+val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
+
 crawl(startURL, 3)