    else SEQ(der(c, r1), r2)
  case STAR(r) => SEQ(der(c, r), STAR(r))
  case NOT(r) => NOT(der(c, r))
}
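
// small sanity check for der/nullable (illustration only, not part of the
// original listing): a string matches r iff the derivative of r with
// respect to all of its characters is nullable
def matches(r: Rexp, s: String) : Boolean =
  nullable(s.toList.foldLeft(r)((r1, c) => der(c, r1)))

// e.g. matches(SEQ(CHAR('a'), STAR(CHAR('b'))), "abb") should be true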
|
// regular expression for specifying
// ranges of characters
def Range(s : List[Char]) : Rexp = s match {
  case Nil => NULL
  case c::Nil => CHAR(c)
  case c::s => ALT(CHAR(c), Range(s))
}

// convenience for typing in regular expressions:
// strings are converted character by character
def charlist2rexp(s : List[Char]) : Rexp = s match {
  case Nil => EMPTY
  case c::Nil => CHAR(c)
  case c::s => SEQ(CHAR(c), charlist2rexp(s))
}
implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
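
// illustration only (these value names are made up for this sketch):
// Range and the implicit string2rexp are used to specify token classes
val Digit  : Rexp = Range("0123456789".toList)
val Number : Rexp = SEQ(Digit, STAR(Digit))   // one or more digits
val KwIf   : Rexp = "if"                      // converted via string2rexp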
|
type Rule[T] = (Rexp, List[Char] => T)

// main class for the tokenizer
case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) {

  // maximal munch: keeps taking derivatives of r as long as the rest of
  // the input can still lead to a match; t accumulates the consumed prefix
  def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
    s match {
      case Nil if (nullable(r)) => Some((Nil, action(t)))
      case Nil => None
      case c::s if (no_more(der(c, r)) && nullable(r)) => Some((c::s, action(t)))
      case c::s if (no_more(der(c, r))) => None
      case c::s => munch(der(c, r), action, s, t ::: List(c))
    }

  // runs all rules on the input and returns the token of the rule with
  // the longest match (i.e. the shortest remaining input)
  def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
    val somes = rules.flatMap { case (r, action) => munch(r, action, s, Nil) }
    if (somes.isEmpty) Right(s.mkString)
    else Left(somes.sortBy(_._1.length).head)
  }

  def tokenize(cs: List[Char]) : List[T] = cs match {
    case Nil => Nil
    case _ => one_token(cs) match {
      case Left((rest, token)) => token :: tokenize(rest)
      case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil }
    }
  }

  // tokenizes a string and filters out the tokens listed in excl
  def fromString(s: String) : List[T] =
    tokenize(s.toList).filterNot(excl.contains(_))

  def fromFile(name: String) : List[T] =
    fromString(io.Source.fromFile(name).mkString)

}
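
// illustrative use of the tokenizer (the token type and rules below are
// made up for this sketch and are not part of the original listing)
abstract class Token
case object T_IF extends Token
case object T_WS extends Token
case class T_NUM(s: String) extends Token

val DigitCl = Range("0123456789".toList)
val WhiteCl = Range(" \t\n".toList)

val Tok = Tokenizer[Token](
  List(("if" : Rexp,                   (cs: List[Char]) => T_IF),
       (SEQ(DigitCl, STAR(DigitCl)),   (cs: List[Char]) => T_NUM(cs.mkString)),
       (SEQ(WhiteCl, STAR(WhiteCl)),   (cs: List[Char]) => T_WS)),
  excl = List(T_WS))

// Tok.fromString("if 42") should give List(T_IF, T_NUM("42"))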
|