author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Fri, 06 May 2016 11:33:21 +0100 | |
changeset 168 | 6b0a1976f89a |
child 169 | 072a701bb153 |
permissions | -rw-r--r-- |
168
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
import scala.language.implicitConversions |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
import scala.language.reflectiveCalls |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
import scala.annotation.tailrec |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
import scala.io.Source |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
import scala.util.parsing.combinator._ |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
/** Abstract syntax tree for regular expressions.
  *
  * The hierarchy is intentionally left unsealed so the definitions can still
  * be entered one statement at a time (e.g. in a REPL session).
  */
abstract class Rexp

// NOTE(review): ZERO and ONE presumably denote the regex matching no string
// and the regex matching only the empty string -- this is the usual naming
// convention but is not established by the code in this file. TODO confirm.
case object ZERO extends Rexp
case object ONE extends Rexp

/** A single literal character; printed as the bare character. */
case class CHAR(c: Char) extends Rexp {
  override def toString: String = c.toString
}

/** Alternative of two regular expressions; printed as `(r1|r2)`. */
case class ALT(r1: Rexp, r2: Rexp) extends Rexp {
  override def toString: String = "(" + r1.toString + "|" + r2.toString + ")"
}

/** Sequence of two regular expressions; printed as `(r1r2)`. */
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp {
  override def toString: String = "(" + r1.toString + r2.toString + ")"
}

// STAR and RECD keep the default case-class rendering, e.g. `STAR(a)` and
// `RECD(x,a)`; they do not override toString.
case class STAR(r: Rexp) extends Rexp
case class RECD(x: String, r: Rexp) extends Rexp
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
21 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
22 |
/** Hand-written recursive-descent parser that turns the string `s` into a
  * [[Rexp]].
  *
  * Grammar implemented by the methods below:
  * {{{
  *   Regex  ::= Term [ '|' Regex ]
  *   Term   ::= { Factor }            // an empty term yields ZERO
  *   Factor ::= Base { '*' }
  *   Base   ::= '(' Regex ')' | <any single character>
  * }}}
  *
  * The parser is stateful: `i` is the cursor into `s` and is advanced as
  * input is consumed, so one instance should be used for one parse.
  *
  * Note that `Regex()` does not check that the whole input was consumed:
  * parsing stops at an unmatched `)` and the remaining input is left unread.
  *
  * @param s the regular-expression source text
  */
case class Parser(s: String) {
  // Index of the next unread character of `s`.
  var i = 0

  /** Returns the next unread character without consuming it.
    *
    * @throws Exception if no input is left
    */
  def peek(): Char =
    if (more()) s(i) else throw new Exception("Unexpected end of input")

  /** Consumes the next character, which must be `c`.
    *
    * @throws Exception if the next character differs from `c` or if the
    *                   input is exhausted (e.g. an unclosed parenthesis)
    */
  def eat(c: Char): Unit =
    if (!more()) throw new Exception("Expected " + c + " got end of input")
    else if (c == s(i)) i = i + 1
    else throw new Exception("Expected " + c + " got " + s(i))

  /** Consumes and returns the next character.
    *
    * The cursor is only advanced after the read succeeded, so a failure at
    * end of input leaves `i` untouched.
    *
    * @throws Exception if no input is left
    */
  def next(): Char = { val c = peek(); i = i + 1; c }

  /** True while there is unread input. */
  def more(): Boolean = s.length - i > 0

  // True while the cursor is inside a term, i.e. there is input left and the
  // next character neither closes a group nor starts an alternative.
  private def inTerm: Boolean = more() && peek() != ')' && peek() != '|'

  /** Parses `Term [ '|' Regex ]`; alternatives nest to the right. */
  def Regex() : Rexp = {
    val t = Term()
    if (more() && peek() == '|') {
      eat('|')
      val r = Regex()
      ALT(t, r)
    }
    else t
  }

  /** Parses a possibly empty run of factors.
    *
    * Factors are combined into left-nested SEQ nodes; an empty run
    * produces ZERO.
    */
  def Term() : Rexp = {
    var f : Rexp = if (inTerm) Factor() else ZERO
    while (inTerm) {
      val nextf = Factor()
      f = SEQ(f, nextf)
    }

    f
  }

  /** Parses a base followed by any number of `*`, each wrapping the result
    * in another STAR.
    */
  def Factor() : Rexp = {
    var b = Base()
    while (more() && peek() == '*') {
      eat('*')
      b = STAR(b)
    }

    b
  }

  /** Parses either a parenthesised regex or a single literal character. */
  def Base() : Rexp = {
    peek() match {
      case '(' => { eat('('); val r = Regex(); eat(')'); r }
      case _ => CHAR(next())
    }
  }
}
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
68 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
69 |
// Smoke test of the parser; with the toString definitions in this file the
// parsed tree ALT(CHAR(a), STAR(SEQ(CHAR(b), CHAR(c)))) prints as (a|STAR((bc))).
println(Parser("a|(bc)*").Regex())
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
70 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
71 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
72 |
/** Parses the regular expression found in the second tab-separated field of
  * `line` and renders it next to its source text.
  *
  * Fields are separated by runs of one or more tab characters.
  *
  * @param line one input line with at least two tab-separated fields
  * @return the regex source, followed by ": " and the printed parse tree
  * @throws IllegalArgumentException if `line` has fewer than two fields
  *                                  (for instance a blank line)
  */
def process(line: String) : String = {
  val fields = line.split("\\t+")
  // Fail with a message naming the offending line instead of a bare
  // ArrayIndexOutOfBoundsException from indexing the split result.
  if (fields.length < 2)
    throw new IllegalArgumentException(
      "Expected at least two tab-separated fields, got: " + line)
  val s = fields(1)
  s + ": " + Parser(s).Regex().toString
}
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
76 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
77 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
78 |
// Input file; the regex is read from the second tab-separated field of each
// line (see `process`).
val filename = "../tests/forced-assoc.txt"

// Read all lines eagerly so the underlying file handle can be closed before
// any processing starts; `toList` forces the lazy line iterator.
val filelines : List[String] = {
  val src = Source.fromFile(filename)
  try src.getLines().toList finally src.close()
}

// Parse every line and print the regex together with its parse tree.
filelines.foreach((s: String) => println(process(s)))
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
83 |