| author | Christian Urban <urbanc@in.tum.de> | 
| Sat, 09 Jun 2018 21:02:04 +0100 | |
| changeset 552 | 8a79cc0b277c | 
| parent 93 | 4794759139ea | 
| permissions | -rw-r--r-- | 
| 
85
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
1  | 
package object matcher {
 | 
| 62 | 2  | 
|
| 
85
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
3  | 
// regular expressions  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
4  | 
// including constructors for NOT and ALLC  | 
| 
92
 
e85600529ca5
moved scala files
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
85 
diff
changeset
 | 
5  | 
sealed abstract class Rexp  | 
| 62 | 6  | 
|
7  | 
case object NULL extends Rexp  | 
|
8  | 
case object EMPTY extends Rexp  | 
|
| 
75
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
9  | 
case object ALLC extends Rexp // recognises any character  | 
| 62 | 10  | 
case class CHAR(c: Char) extends Rexp  | 
11  | 
case class ALT(r1: Rexp, r2: Rexp) extends Rexp  | 
|
12  | 
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  | 
|
13  | 
case class STAR(r: Rexp) extends Rexp  | 
|
| 
75
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
14  | 
case class NOT(r: Rexp) extends Rexp // negation of a regular expression  | 
| 62 | 15  | 
|
16  | 
||
17  | 
// nullable function: tests whether the regular  | 
|
18  | 
// expression can recognise the empty string  | 
|
19  | 
def nullable (r: Rexp) : Boolean = r match {
 | 
|
20  | 
case NULL => false  | 
|
21  | 
case EMPTY => true  | 
|
| 
75
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
22  | 
case ALLC => false  | 
| 62 | 23  | 
case CHAR(_) => false  | 
24  | 
case ALT(r1, r2) => nullable(r1) || nullable(r2)  | 
|
25  | 
case SEQ(r1, r2) => nullable(r1) && nullable(r2)  | 
|
26  | 
case STAR(_) => true  | 
|
27  | 
case NOT(r) => !(nullable(r))  | 
|
28  | 
}  | 
|
29  | 
||
30  | 
// tests whether a regular expression  | 
|
31  | 
// cannot recognise more  | 
|
32  | 
def no_more (r: Rexp) : Boolean = r match {
 | 
|
33  | 
case NULL => true  | 
|
34  | 
case EMPTY => false  | 
|
| 
75
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
35  | 
case ALLC => false  | 
| 62 | 36  | 
case CHAR(_) => false  | 
37  | 
case ALT(r1, r2) => no_more(r1) && no_more(r2)  | 
|
38  | 
case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1)  | 
|
39  | 
case STAR(_) => false  | 
|
40  | 
case NOT(r) => !(no_more(r))  | 
|
41  | 
}  | 
|
42  | 
||
43  | 
||
44  | 
// derivative of a regular expression w.r.t. a character  | 
|
45  | 
def der (c: Char, r: Rexp) : Rexp = r match {
 | 
|
46  | 
case NULL => NULL  | 
|
| 
75
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
47  | 
case EMPTY => NULL  | 
| 
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
48  | 
case ALLC => EMPTY  | 
| 
 
898c25a4e399
tuned
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
64 
diff
changeset
 | 
49  | 
case CHAR(d) => if (c == d) EMPTY else NULL  | 
| 62 | 50  | 
case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))  | 
51  | 
case SEQ(r1, r2) =>  | 
|
52  | 
if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))  | 
|
53  | 
else SEQ(der(c, r1), r2)  | 
|
54  | 
case STAR(r) => SEQ(der(c, r), STAR(r))  | 
|
55  | 
case NOT(r) => NOT(der (c, r))  | 
|
56  | 
}  | 
|
57  | 
||
| 
85
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
58  | 
// main class for the tokenizer  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
59  | 
case class Tokenizer[T](rules: List[(Rexp, List[Char] => T)], excl: List[T] = Nil) {
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
60  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
61  | 
def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
62  | 
  s match {
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
63  | 
case Nil if (nullable(r)) => Some(Nil, action(t))  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
64  | 
case Nil => None  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
65  | 
case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
66  | 
case c::s if (no_more(der (c, r))) => None  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
67  | 
case c::s => munch(der (c, r), action, s, t ::: List(c))  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
68  | 
}  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
69  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
70  | 
def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
71  | 
  val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
72  | 
if (somes == Nil) Right(s.mkString)  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
73  | 
else Left(somes sortBy (_._1.length) head)  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
74  | 
}  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
75  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
76  | 
def tokenize(cs: List[Char]) : List[T] = cs match {
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
77  | 
case Nil => Nil  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
78  | 
  case _ => one_token(cs) match {
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
79  | 
case Left((rest, token)) => token :: tokenize(rest)  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
80  | 
    case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil } 
 | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
81  | 
}  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
82  | 
}  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
83  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
84  | 
def fromString(s: String) : List[T] =  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
85  | 
tokenize(s.toList).filterNot(excl.contains(_))  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
86  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
87  | 
def fromFile(name: String) : List[T] =  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
88  | 
fromString(io.Source.fromFile(name).mkString)  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
89  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
90  | 
}  | 
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
91  | 
|
| 
 
1a4065f965fb
added packages
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
75 
diff
changeset
 | 
92  | 
|
| 62 | 93  | 
// regular expression for specifying  | 
94  | 
// ranges of characters  | 
|
| 
64
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
95  | 
def Range(s : List[Char]) : Rexp = s match {
 | 
| 62 | 96  | 
case Nil => NULL  | 
97  | 
case c::Nil => CHAR(c)  | 
|
| 
64
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
98  | 
case c::s => ALT(CHAR(c), Range(s))  | 
| 62 | 99  | 
}  | 
| 
64
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
100  | 
def RANGE(s: String) = Range(s.toList)  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
101  | 
|
| 62 | 102  | 
|
103  | 
// one or more  | 
|
104  | 
def PLUS(r: Rexp) = SEQ(r, STAR(r))  | 
|
105  | 
||
| 
64
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
106  | 
// many alternatives  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
107  | 
def Alts(rs: List[Rexp]) : Rexp = rs match {
 | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
108  | 
case Nil => NULL  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
109  | 
case r::Nil => r  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
110  | 
case r::rs => ALT(r, Alts(rs))  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
111  | 
}  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
112  | 
def ALTS(rs: Rexp*) = Alts(rs.toList)  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
113  | 
|
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
114  | 
// repetitions  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
115  | 
def Seqs(rs: List[Rexp]) : Rexp = rs match {
 | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
116  | 
case Nil => NULL  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
117  | 
case r::Nil => r  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
118  | 
case r::rs => SEQ(r, Seqs(rs))  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
119  | 
}  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
120  | 
def SEQS(rs: Rexp*) = Seqs(rs.toList)  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
121  | 
|
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
122  | 
// some convenience for typing in regular expressions  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
123  | 
def charlist2rexp(s : List[Char]) : Rexp = s match {
 | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
124  | 
case Nil => EMPTY  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
125  | 
case c::Nil => CHAR(c)  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
126  | 
case c::s => SEQ(CHAR(c), charlist2rexp(s))  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
127  | 
}  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
128  | 
implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)  | 
| 
 
2d625418c011
added everything
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
62 
diff
changeset
 | 
129  | 
|
| 62 | 130  | 
}  |