progs/lecture3.scala
changeset 481 e03a0100ec46
parent 449 d67c5f7177a6
child 493 244df77507c2
--- a/progs/lecture3.scala	Mon Nov 06 21:49:55 2023 +0000
+++ b/progs/lecture3.scala	Fri Dec 08 00:54:36 2023 +0000
@@ -1,37 +1,218 @@
 // Scala Lecture 3
 //=================
 
-// - Higher-Order functions
-// - maps (behind for-comprehensions)
+// last week:
+// higher-order functions
+// maps
 
+// - recursion
+// - Sudoku
+// - string interpolations
 // - Pattern-Matching
 
-def fib(n: Int) : Int = n match {
-  case 0 => 1
-  case 1 =>  1
-  case n => fib(n - 1) + fib(n - 2)
+// A Recursive Web Crawler / Email Harvester
+//===========================================
+//
+// the idea is to look for links using the
+// regular expression "https?://[^"]*" and for
+// email addresses using another regex.
+
+import io.Source
+import scala.util._
+
+// gets the first 10K of a web-page; the source is closed afterwards, "" is returned on failure
+def get_page(url: String) : String = {
+  Using(Source.fromURL(url)("ISO-8859-1"))(_.take(10000).mkString).
+    getOrElse { println(s"  Problem with: $url"); ""}
+}
+
+// regex for URLs and emails
+val http_pattern = """"https?://[^"]*"""".r
+val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
+
+//test case:
+//email_pattern.findAllIn
+//  ("foo bla christian@kcl.ac.uk 1234567").toList
+
+
+// strips one character from each end of a string (used to remove the surrounding quotes)
+def unquote(s: String) = s.slice(1, s.length - 1)
+
+def get_all_URLs(page: String): Set[String] = 
+  http_pattern.findAllIn(page).map(unquote).toSet
+
+// naive version of crawl - searches until a given depth,
+// visits pages potentially more than once
+def crawl(url: String, n: Int) : Unit = {
+  if (n != 0) {   // depth 0: nothing left to visit
+    println(s"  Visiting: $n $url")
+    val links = get_all_URLs(get_page(url))
+    links.foreach(link => crawl(link, n - 1))
+  }
+}
+
+// some starting URLs for the crawler
+val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
+
+crawl(startURL, 2)
+
+
+// collects the email addresses on url and, recursively, on the pages it links to (depth n)
+def emails(url: String, n: Int) : Set[String] = {
+  if (n == 0) Set()
+  else {
+    println(s"  Visiting: $n $url")
+    val html = get_page(url)
+    val linked = get_all_URLs(html).flatMap(link => emails(link, n - 1))
+    email_pattern.findAllIn(html).toSet ++ linked
+  }
+}
+
+emails(startURL, 2)
+
+
+
+// Sudoku 
+//========
+
+// THE POINT OF THIS CODE IS NOT TO BE SUPER
+// EFFICIENT AND FAST, just explaining exhaustive
+// depth-first search
+
+
+val game0 = """.14.6.3..
+              |62...4..9
+              |.8..5.6..
+              |.6.2....3
+              |.7..1..5.
+              |5....9.6.
+              |..6.2..3.
+              |1..5...92
+              |..7.9.41.""".stripMargin.replaceAll("\\n", "")
+
+type Pos = (Int, Int)
+val EmptyValue = '.'
+val MaxValue = 9
+
+def pretty(game: String): String = 
+  game.grouped(MaxValue).mkString("\n", "\n", "")  // leading newline, then one row per line
+
+pretty(game0)
+
+
+val allValues = "123456789".toList
+val indexes = (0 to 8).toList
+
+def empty(game: String) = game.indexOf(EmptyValue)
+def isDone(game: String) = empty(game) == -1 
+def emptyPosition(game: String) : Pos = {
+  val e = empty(game)
+  (e % MaxValue, e / MaxValue)
+}
+
+def get_row(game: String, y: Int) = 
+  for (col <- indexes) yield game(y * MaxValue + col)   // cells of row y, left to right
+def get_col(game: String, x: Int) = 
+  for (row <- indexes) yield game(x + row * MaxValue)   // cells of column x, top to bottom
+
+//get_row(game0, 0)
+//get_row(game0, 1)
+//get_col(game0, 0)
+
+def get_box(game: String, pos: Pos): List[Char] = {
+    // (left, top) is the upper-left cell of the 3x3 box containing pos
+    val (px, py) = pos
+    val left = px - px % 3
+    val top = py - py % 3
+    for (x <- List(left, left + 1, left + 2);
+         y <- List(top, top + 1, top + 2)) yield game(x + y * MaxValue)
 }
 
 
-abstract class Rexp
-case object ZERO extends Rexp                      // matches nothing
-case object ONE extends Rexp                       // matches the empty string
-case class CHAR(c: Char) extends Rexp              // matches a character c
-case class ALT(r1: Rexp, r2: Rexp) extends Rexp    // alternative
-case class SEQ(r1: Rexp, r2: Rexp) extends Rexp    // sequence
-case class STAR(r: Rexp) extends Rexp              // star
+//get_box(game0, (3, 1))
+
+
+// this is not mutable!!
+def update(game: String, pos: Int, value: Char): String = 
+  game.updated(pos, value)
+
+def toAvoid(game: String, pos: Pos): List[Char] = 
+  List(get_col(game, pos._1),   // values in the same column,
+       get_row(game, pos._2),   // in the same row,
+       get_box(game, pos)).flatten   // and in the same 3x3 box
 
-def depth(r: Rexp) : Int = r match {
-  case ZERO => 1
-  case ONE => 1
-  case CHAR(_) => 1
-  case ALT(r1, r2) => 1 + List(depth(r1), depth(r2)).max
-  case SEQ(r1, r2) => 1 + List(depth(r1), depth(r2)).max
-  case STAR(r1) => 1 + depth(r1)
+def candidates(game: String, pos: Pos): List[Char] = 
+  allValues.diff(toAvoid(game, pos))
+
+//candidates(game0, (0,0))
+
+
+def search(game: String): List[String] = {
+  if (isDone(game)) List(game)
+  else {
+    val cs = candidates(game, emptyPosition(game))
+    cs.par.map(c => search(update(game, empty(game), c))).flatten.toList
+  }
 }
 
+pretty(game0)
+search(game0).map(pretty)
 
-// - String-Interpolations
+val game1 = """23.915...
+              |...2..54.
+              |6.7......
+              |..1.....9
+              |89.5.3.17
+              |5.....6..
+              |......9.5
+              |.16..7...
+              |...329..1""".stripMargin.replaceAll("\\n", "")
+
+search(game1).map(pretty)
+
+// a game that is in the hard category
+val game2 = """8........
+              |..36.....
+              |.7..9.2..
+              |.5...7...
+              |....457..
+              |...1...3.
+              |..1....68
+              |..85...1.
+              |.9....4..""".stripMargin.replaceAll("\\n", "")
+
+search(game2).map(pretty)
+
+// game with multiple solutions
+val game3 = """.8...9743
+              |.5...8.1.
+              |.1.......
+              |8....5...
+              |...8.4...
+              |...3....6
+              |.......7.
+              |.3.5...8.
+              |9724...5.""".stripMargin.replaceAll("\\n", "")
+
+search(game3).map(pretty).foreach(println)
+
+// for measuring time
+def time_needed[T](i: Int, code: => T) = {
+  val start = System.nanoTime()
+  for (j <- 1 to i) code
+  val end = System.nanoTime()
+  s"${(end - start) / 1.0e9} secs"
+}
+
+time_needed(2, search(game2))
+
+
+// concurrency: the .par used in search needs the parallel-collections library, e.g.
+// scala-cli --extra-jars scala-parallel-collections_3-1.0.4.jar 
+// import scala.collection.parallel.CollectionConverters._
+
+
+
 
 // String Interpolations
 //=======================
@@ -63,19 +244,6 @@
 gcd_db(48, 18)
 
 
-// naive quicksort with "On" function
-
-def sortOn(f: Int => Int, xs: List[Int]) : List[Int] = {
-  if (xs.size < 2) xs
-  else {
-   val pivot = xs.head
-   val (left, right) = xs.partition(f(_) < f(pivot))
-   sortOn(f, left) ::: pivot :: sortOn(f, right.tail)
-  }
-} 
-
-sortOn(identity, List(99,99,99,98,10,-3,2)) 
-sortOn(n => - n, List(99,99,99,98,10,-3,2))
 
 
 // Recursion Again ;o)
@@ -101,8 +269,103 @@
 
 
 
-// User-defined Datatypes
-//========================
+// Pattern Matching
+//==================
+
+// A powerful tool which has even landed in Java during 
+// the last few years (https://inside.java/2021/06/13/podcast-017/).
+// ...Scala has had it for many years and the concept is
+// older than your friendly lecturer, that is stone-old  ;o)
+
+// The general schema:
+//
+//    expression match {
+//       case pattern1 => expression1
+//       case pattern2 => expression2
+//       ...
+//       case patternN => expressionN
+//    }
+
+
+// recall
+def len(xs: List[Int]) : Int = {
+    if (xs == Nil) 0
+    else 1 + len(xs.tail)
+}    
+
+def len(xs: List[Int]) : Int = xs match {
+    case Nil => 0
+    case hd::tail => 1 + len(tail)
+}  
+
+
+def my_map_int(lst: List[Int], f: Int => Int) : List[Int] = 
+  lst match {
+    case Nil => Nil
+    case x::xs => f(x)::my_map_int(xs, f)
+  }
+
+def my_map_option(opt: Option[Int], f: Int => Int) : Option[Int] = 
+  opt match {
+    case None => None
+    case Some(x) => Some(f(x))
+  }
+
+my_map_option(None, x => x * x)
+my_map_option(Some(8), x => x * x)
+
+
+// you can also have cases combined
+def season(month: String) : String = month match {
+  case "March" | "April" | "May" => "It's spring"
+  case "June" | "July" | "August" => "It's summer"
+  case "September" | "October" | "November" => "It's autumn"
+  case "December" => "It's winter"
+  case "January" | "February" => "It's unfortunately winter"
+  case _ => "Wrong month"
+}
+
+// pattern-match on integers
+
+def fib(n: Int) : Int = n match { 
+  case 0 | 1 => 1
+  case n => fib(n - 1) + fib(n - 2)
+}
+
+fib(10)
+
+// pattern-match on results
+
+// Silly: fizz buzz
+def fizz_buzz(n: Int) : String = (n % 3, n % 5) match {
+  case (0, 0) => "fizz buzz"
+  case (0, _) => "fizz"
+  case (_, 0) => "buzz"
+  case _ => n.toString  
+}
+
+for (n <- 1 to 20) 
+ println(fizz_buzz(n))
+
+// guards in pattern-matching
+
+def foo(xs: List[Int]) : String = xs match {
+  case Nil => s"this list is empty"
+  case x :: xs if x % 2 == 0 
+     => s"the first elemnt is even"
+  case x :: y :: rest if x == y
+     => s"this has two elemnts that are the same"
+  case hd :: tl => s"this list is standard $hd::$tl"
+}
+
+foo(Nil)
+foo(List(1,2,3))
+foo(List(1,2))
+foo(List(1,1,2,3))
+foo(List(2,2,2,3))
+
+
+// Trees
 
 abstract class Tree
 case class Leaf(x: Int) extends Tree
@@ -182,6 +445,27 @@
 RomanNumeral2Int(List(M,M,X,V,I,I))     // 2017
 
 
+abstract class Rexp
+case object ZERO extends Rexp                      // matches nothing
+case object ONE extends Rexp                       // matches the empty string
+case class CHAR(c: Char) extends Rexp              // matches a character c
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp    // alternative
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp    // sequence
+case class STAR(r: Rexp) extends Rexp              // star
+
+def depth(r: Rexp) : Int = r match {
+  case ZERO => 1
+  case ONE => 1
+  case CHAR(_) => 1
+  case ALT(r1, r2) => 1 + List(depth(r1), depth(r2)).max
+  case SEQ(r1, r2) => 1 + List(depth(r1), depth(r2)).max
+  case STAR(r1) => 1 + depth(r1)
+}
+
+
+
+
+
 // expressions (essentially trees)
 
 abstract class Exp
@@ -254,22 +538,44 @@
 parse_date("26.11.2019")
 
 
-// guards in pattern-matching
+
+
+// Map type (upper-case)
+//=======================
+
+// Note the difference between map and Map
+
+val m = Map(1 -> "one", 2 -> "two", 10 -> "many")
+
+List((1, "one"), (2, "two"), (10, "many")).toMap
+
+m.get(1)
+m.get(4)
+
+m.getOrElse(1, "")
+m.getOrElse(4, "")
+
+val new_m = m + (10 -> "ten")
 
-def foo(xs: List[Int]) : String = xs match {
-  case Nil => s"this list is empty"
-  case x :: xs if x % 2 == 0
-     => s"the first elemnt is even"
-  case x :: y :: rest if x == y
-     => s"this has two elemnts that are the same"
-  case hd :: tl => s"this list is standard $hd::$tl"
-}
+new_m.get(10)
+
+val m2 = for ((k, v) <- m) yield (k, v.toUpperCase)
+
+
+
+// groupBy function on Maps
+val lst = List("one", "two", "three", "four", "five")
+lst.groupBy(_.head)
 
-foo(Nil)
-foo(List(1,2,3))
-foo(List(1,2))
-foo(List(1,1,2,3))
-foo(List(2,2,2,3))
+lst.groupBy(_.length)
+
+lst.groupBy(_.length).get(3)
+
+val grps = lst.groupBy(_.length)
+grps.keySet
+
+
+
 
 // Tail recursion
 //================
@@ -316,125 +622,89 @@
 lengthT(List.fill(10000000)(1), 0)
 
 
-// Sudoku
-//========
-
-// uses Strings for games
-
-type Pos = (Int, Int)
-val emptyValue = '.'
-val maxValue = 9
-
-val allValues = "123456789".toList
-val indexes = (0 to 8).toList
 
 
-def empty(game: String) = game.indexOf(emptyValue)
-def isDone(game: String) = empty(game) == -1 
-def emptyPosition(game: String) : Pos = 
-  (empty(game) % maxValue, empty(game) / maxValue)
+
 
 
-def get_row(game: String, y: Int) = indexes.map(col => game(y * maxValue + col))
-def get_col(game: String, x: Int) = indexes.map(row => game(x + row * maxValue))
+// Aside: concurrency 
+// scala-cli --extra-jars scala-parallel-collections_3-1.0.4.jar 
 
-def get_box(game: String, pos: Pos): List[Char] = {
-    def base(p: Int): Int = (p / 3) * 3
-    val x0 = base(pos._1)
-    val y0 = base(pos._2)
-    for (x <- (x0 until x0 + 3).toList;
-         y <- (y0 until y0 + 3).toList) yield game(x + y * maxValue)
-}         
+for (n <- (1 to 10)) println(n)
+
+import scala.collection.parallel.CollectionConverters._
+
+for (n <- (1 to 10).par) println(n)
 
 
-def update(game: String, pos: Int, value: Char): String = 
-  game.updated(pos, value)
-
-def toAvoid(game: String, pos: Pos): List[Char] = 
-  (get_col(game, pos._1) ++ get_row(game, pos._2) ++ get_box(game, pos))
-
-def candidates(game: String, pos: Pos): List[Char] = 
-  allValues.diff(toAvoid(game, pos))
-
-def search(game: String): List[String] = {
-  if (isDone(game)) List(game)
-  else 
-    candidates(game, emptyPosition(game)).
-      map(c => search(update(game, empty(game), c))).flatten
-}
-
-
-def search1T(games: List[String]): Option[String] = games match {
-  case Nil => None
-  case game::rest => {
-    if (isDone(game)) Some(game)
-    else {
-      val cs = candidates(game, emptyPosition(game))
-      search1T(cs.map(c => update(game, empty(game), c)) ::: rest)
-    }
-  }
+// for measuring time: evaluates code exactly n times and returns the total elapsed seconds
+def time_needed[T](n: Int, code: => T) = {
+  val start = System.nanoTime()
+  for (_ <- 1 to n) code
+  val end = System.nanoTime()
+  (end - start) / 1.0e9
+}
 
-def pretty(game: String): String = 
-  "\n" + (game.sliding(maxValue, maxValue).mkString(",\n"))
+val list = (1L to 10_000_000L).toList
+time_needed(10, for (n <- list) yield n + 42)
+time_needed(10, for (n <- list.par) yield n + 42)
 
-
-// tail recursive version that searches 
-// for all solutions
+// ...but par does not make everything faster
 
-def searchT(games: List[String], sols: List[String]): List[String] = games match {
-  case Nil => sols
-  case game::rest => {
-    if (isDone(game)) searchT(rest, game::sols)
-    else {
-      val cs = candidates(game, emptyPosition(game))
-      searchT(cs.map(c => update(game, empty(game), c)) ::: rest, sols)
-    }
-  }
-}
+list.sum
+list.par.sum
 
-searchT(List(game3), List()).map(pretty)
+time_needed(10, list.sum)
+time_needed(10, list.par.sum)
 
 
-// tail recursive version that searches 
-// for a single solution
+// Mutable vs Immutable
+//======================
+//
+// Remember:
+// - no vars, no ++i, no +=
+// - no mutable data-structures (no Arrays, no ListBuffers)
 
-def search1T(games: List[String]): Option[String] = games match {
-  case Nil => None
-  case game::rest => {
-    if (isDone(game)) Some(game)
-    else {
-      val cs = candidates(game, emptyPosition(game))
-      search1T(cs.map(c => update(game, empty(game), c)) ::: rest)
-    }
-  }
+// But what the heck....let's try to count to 100,000 in parallel
+// 
+// requires
+// scala-cli --extra-jars scala-parallel-collections_3-1.0.4.jar
+
+import scala.collection.parallel.CollectionConverters._
+
+def test() = {
+  var cnt = 0
+
+  for(i <- (1 to 100_000).par) cnt += 1
+
+  println(s"Should be 100000: $cnt")
 }
 
-search1T(List(game3)).map(pretty)
-time_needed(10, search1T(List(game3)))
+test()
+
+// Or
+// Q: Count how many elements are in the intersection of 
+//    two sets?
+// A: IMPROPER WAY (mutable counter - only correct while the loop stays sequential)
+
+def count_intersection(A: Set[Int], B: Set[Int]) : Int = {
+  var count = 0
+  for (x <- A; if B contains x) count += 1 
+  count
+}
+
+val A = (0 to 999).toSet
+val B = (0 to 999 by 4).toSet
+
+count_intersection(A, B)
+
+// but do not try to add .par to the for-loop above
 
 
-// game with multiple solutions
-val game3 = """.8...9743
-              |.5...8.1.
-              |.1.......
-              |8....5...
-              |...8.4...
-              |...3....6
-              |.......7.
-              |.3.5...8.
-              |9724...5.""".stripMargin.replaceAll("\\n", "")
+// proper parallel version
+def count_intersection2(A: Set[Int], B: Set[Int]) : Int = 
+  A.par.count(x => B contains x)
 
-searchT(List(game3), Nil).map(pretty)
-search1T(List(game3)).map(pretty)
+count_intersection2(A, B)
 
-// Moral: Whenever a recursive function is resource-critical
-// (i.e. works with large recursion depth), then you need to
-// write it in tail-recursive fashion.
-// 
-// Unfortuantely, Scala because of current limitations in 
-// the JVM is not as clever as other functional languages. It can 
-// only optimise "self-tail calls". This excludes the cases of 
-// multiple functions making tail calls to each other. Well,
-// nothing is perfect.