author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Tue, 17 May 2016 03:47:33 +0100 | |
changeset 180 | 42ffaca7c85e |
parent 169 | 072a701bb153 |
child 195 | c2d36c3cf8ad |
permissions | -rw-r--r-- |
168
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
import scala.language.implicitConversions |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
import scala.language.reflectiveCalls |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
import scala.annotation.tailrec |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
import scala.io.Source |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
import scala.util.parsing.combinator._ |
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
/** Root of the regular-expression syntax tree.
  *
  * Sealed so that every constructor lives in this file and the compiler can
  * check pattern matches over `Rexp` for exhaustiveness.
  */
sealed abstract class Rexp

/** Constant regex; the parser in this file produces it for an empty term.
  * NOTE(review): presumably the regex that matches no string -- confirm
  * against the matcher, which is not part of this file.
  */
case object ZERO extends Rexp

/** Constant regex; never produced by the parser in this file.
  * NOTE(review): presumably the regex that matches the empty string -- confirm.
  */
case object ONE extends Rexp

/** A single literal character; prints as that character. */
final case class CHAR(c: Char) extends Rexp {
  override def toString: String = c.toString
}

/** Alternative of two regexes; prints as `(r1|r2)`. */
final case class ALT(r1: Rexp, r2: Rexp) extends Rexp {
  override def toString: String = s"($r1|$r2)"
}

/** Sequence of two regexes; prints as `(r1r2)`. */
final case class SEQ(r1: Rexp, r2: Rexp) extends Rexp {
  override def toString: String = s"($r1$r2)"
}

/** Star applied to `r`; keeps the default case-class rendering `STAR(...)`. */
final case class STAR(r: Rexp) extends Rexp

/** A regex `r` tagged with the string `x`; never produced by the parser in
  * this file. Keeps the default case-class rendering `RECD(x,r)`.
  */
final case class RECD(x: String, r: Rexp) extends Rexp
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
21 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
22 |
/** Hand-written recursive-descent parser that turns the string `s` into a `Rexp`.
  *
  * Grammar implemented by the methods below:
  * {{{
  *   Regex  ::= Term [ '|' Regex ]
  *   Term   ::= { Factor }          -- stops at ')', '|' or end of input
  *   Factor ::= Base { '*' }
  *   Base   ::= '(' Regex ')' | any single character
  * }}}
  *
  * The parser is stateful: `i` is the cursor into `s`, so an instance is
  * meant to be used for one parse, starting from `Regex()`.
  *
  * `Regex()` does not check that all input was consumed; for example a stray
  * top-level ')' ends the parse and leaves the rest of `s` unread.
  */
case class Parser(s: String) {
  /** Index of the next unread character of `s`. */
  var i = 0

  /** Returns the next unread character without consuming it.
    * Fails with an index-out-of-bounds error when no input is left.
    */
  def peek(): Char = s(i)

  /** Consumes the character `c`.
    *
    * @throws Exception if the next character differs from `c`, or if the
    *                   input is already exhausted (e.g. an unclosed group).
    */
  def eat(c: Char): Unit =
    if (!more())
      // Previously this case surfaced as a raw StringIndexOutOfBoundsException.
      throw new Exception("Expected " + c + " got end of input")
    else if (c == s(i)) i = i + 1
    else throw new Exception("Expected " + c + " got " + s(i))

  /** Consumes and returns the next character. The character is read before
    * the cursor moves, so a failure at end of input leaves `i` untouched.
    */
  def next(): Char = {
    val c = peek()
    i = i + 1
    c
  }

  /** True while there is unread input. */
  def more(): Boolean = s.length - i > 0

  /** Regex ::= Term [ '|' Regex ]. Alternatives nest to the right. */
  def Regex(): Rexp = {
    val t = Term()
    if (more() && peek() == '|') {
      eat('|')
      ALT(t, Regex())
    }
    else t
  }

  /** Term ::= { Factor }. Factors are combined into left-nested `SEQ`s.
    *
    * An empty term (end of input, or ')' / '|' right away) yields `ZERO`.
    * NOTE(review): an empty alternative or group might be expected to yield
    * `ONE` instead -- confirm the intended meaning before changing it.
    */
  def Term(): Rexp = {
    def atFactor: Boolean = more() && peek() != ')' && peek() != '|'

    var f: Rexp = if (atFactor) Factor() else ZERO
    while (atFactor) {
      f = SEQ(f, Factor())
    }
    f
  }

  /** Factor ::= Base { '*' }. Each trailing '*' wraps the result in `STAR`. */
  def Factor(): Rexp = {
    var b: Rexp = Base()
    while (more() && peek() == '*') {
      eat('*')
      b = STAR(b)
    }
    b
  }

  /** Base ::= '(' Regex ')' | any single character.
    *
    * Any character other than '(' is taken literally, so a '*' with nothing
    * before it becomes `CHAR('*')`.
    */
  def Base(): Rexp =
    peek() match {
      case '(' =>
        eat('(')
        val r = Regex()
        eat(')')
        r
      case _ => CHAR(next())
    }
}
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
67 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
68 |
// Smoke test: parse a sample regex and print its tree, i.e. (a|STAR((bc))).
println(Parser("a|(bc)*").Regex().toString)
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
69 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
70 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
71 |
/** Parses the regex found in one line of the test file.
  *
  * The line is split on runs of tab characters and the second field is taken
  * as the regex text; a line without a second field fails with an
  * index-out-of-bounds error.
  *
  * @param line one line of input with at least two tab-separated fields
  * @return the regex text, then ": ", then the printed parse tree
  */
def process(line: String): String = {
  val fields = line.split("\\t+")
  val regexText = fields(1)
  val parsed = Parser(regexText).Regex()
  s"$regexText: $parsed"
}
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
75 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
76 |
|
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
77 |
// Input file: the regex is taken from the second tab-separated field of each line.
val filename = "../tests/forced-assoc.txt"

// Read every line eagerly, then close the file. Previously the Source was
// never closed, which leaked the underlying file handle.
val filelines : List[String] = {
  val source = Source.fromFile(filename)
  try source.getLines().toList
  finally source.close()
}

// Parse the regex on each line and print it next to its parse tree.
filelines.foreach((s: String) => println(process(s)))
6b0a1976f89a
added parser for regexes
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
82 |