|
1 section "Derivatives of regular expressions" |
|
2 |
|
3 (* Author: Christian Urban *) |
|
4 |
|
5 theory Derivatives |
|
6 imports Regular_Exp |
|
7 begin |
|
8 |
|
9 text{* This theory is based on work by Brozowski \cite{Brzozowski64} and Antimirov \cite{Antimirov95}. *} |
|
10 |
|
11 subsection {* Brzozowski's derivatives of regular expressions *} |
|
12 |
|
13 primrec |
|
14 deriv :: "'a \<Rightarrow> 'a rexp \<Rightarrow> 'a rexp" |
|
15 where |
|
16 "deriv c (Zero) = Zero" |
|
17 | "deriv c (One) = Zero" |
|
18 | "deriv c (Atom c') = (if c = c' then One else Zero)" |
|
19 | "deriv c (Plus r1 r2) = Plus (deriv c r1) (deriv c r2)" |
|
20 | "deriv c (Times r1 r2) = |
|
21 (if nullable r1 then Plus (Times (deriv c r1) r2) (deriv c r2) else Times (deriv c r1) r2)" |
|
22 | "deriv c (Star r) = Times (deriv c r) (Star r)" |
|
23 |
|
24 primrec |
|
25 derivs :: "'a list \<Rightarrow> 'a rexp \<Rightarrow> 'a rexp" |
|
26 where |
|
27 "derivs [] r = r" |
|
28 | "derivs (c # s) r = derivs s (deriv c r)" |
|
29 |
|
30 |
|
31 lemma atoms_deriv_subset: "atoms (deriv x r) \<subseteq> atoms r" |
|
32 by (induction r) (auto) |
|
33 |
|
34 lemma atoms_derivs_subset: "atoms (derivs w r) \<subseteq> atoms r" |
|
35 by (induction w arbitrary: r) (auto dest: atoms_deriv_subset[THEN subsetD]) |
|
36 |
|
37 lemma lang_deriv: "lang (deriv c r) = Deriv c (lang r)" |
|
38 by (induct r) (simp_all add: nullable_iff) |
|
39 |
|
40 lemma lang_derivs: "lang (derivs s r) = Derivs s (lang r)" |
|
41 by (induct s arbitrary: r) (simp_all add: lang_deriv) |
|
42 |
|
43 text {* A regular expression matcher: *} |
|
44 |
|
45 definition matcher :: "'a rexp \<Rightarrow> 'a list \<Rightarrow> bool" where |
|
46 "matcher r s = nullable (derivs s r)" |
|
47 |
|
48 lemma matcher_correctness: "matcher r s \<longleftrightarrow> s \<in> lang r" |
|
49 by (induct s arbitrary: r) |
|
50 (simp_all add: nullable_iff lang_deriv matcher_def Deriv_def) |
|
51 |
|
52 |
|
53 subsection {* Antimirov's partial derivatives *} |
|
54 |
|
55 abbreviation |
|
56 "Timess rs r \<equiv> (\<Union>r' \<in> rs. {Times r' r})" |
|
57 |
|
58 primrec |
|
59 pderiv :: "'a \<Rightarrow> 'a rexp \<Rightarrow> 'a rexp set" |
|
60 where |
|
61 "pderiv c Zero = {}" |
|
62 | "pderiv c One = {}" |
|
63 | "pderiv c (Atom c') = (if c = c' then {One} else {})" |
|
64 | "pderiv c (Plus r1 r2) = (pderiv c r1) \<union> (pderiv c r2)" |
|
65 | "pderiv c (Times r1 r2) = |
|
66 (if nullable r1 then Timess (pderiv c r1) r2 \<union> pderiv c r2 else Timess (pderiv c r1) r2)" |
|
67 | "pderiv c (Star r) = Timess (pderiv c r) (Star r)" |
|
68 |
|
69 primrec |
|
70 pderivs :: "'a list \<Rightarrow> 'a rexp \<Rightarrow> ('a rexp) set" |
|
71 where |
|
72 "pderivs [] r = {r}" |
|
73 | "pderivs (c # s) r = \<Union> (pderivs s ` pderiv c r)" |
|
74 |
|
75 abbreviation |
|
76 pderiv_set :: "'a \<Rightarrow> 'a rexp set \<Rightarrow> 'a rexp set" |
|
77 where |
|
78 "pderiv_set c rs \<equiv> \<Union> (pderiv c ` rs)" |
|
79 |
|
80 abbreviation |
|
81 pderivs_set :: "'a list \<Rightarrow> 'a rexp set \<Rightarrow> 'a rexp set" |
|
82 where |
|
83 "pderivs_set s rs \<equiv> \<Union> (pderivs s ` rs)" |
|
84 |
|
85 lemma pderivs_append: |
|
86 "pderivs (s1 @ s2) r = \<Union> (pderivs s2 ` pderivs s1 r)" |
|
87 by (induct s1 arbitrary: r) (simp_all) |
|
88 |
|
89 lemma pderivs_snoc: |
|
90 shows "pderivs (s @ [c]) r = pderiv_set c (pderivs s r)" |
|
91 by (simp add: pderivs_append) |
|
92 |
|
93 lemma pderivs_simps [simp]: |
|
94 shows "pderivs s Zero = (if s = [] then {Zero} else {})" |
|
95 and "pderivs s One = (if s = [] then {One} else {})" |
|
96 and "pderivs s (Plus r1 r2) = (if s = [] then {Plus r1 r2} else (pderivs s r1) \<union> (pderivs s r2))" |
|
97 by (induct s) (simp_all) |
|
98 |
|
99 lemma pderivs_Atom: |
|
100 shows "pderivs s (Atom c) \<subseteq> {Atom c, One}" |
|
101 by (induct s) (simp_all) |
|
102 |
|
103 subsection {* Relating left-quotients and partial derivatives *} |
|
104 |
|
105 lemma Deriv_pderiv: |
|
106 shows "Deriv c (lang r) = \<Union> (lang ` pderiv c r)" |
|
107 by (induct r) (auto simp add: nullable_iff conc_UNION_distrib) |
|
108 |
|
109 lemma Derivs_pderivs: |
|
110 shows "Derivs s (lang r) = \<Union> (lang ` pderivs s r)" |
|
111 proof (induct s arbitrary: r) |
|
112 case (Cons c s) |
|
113 have ih: "\<And>r. Derivs s (lang r) = \<Union> (lang ` pderivs s r)" by fact |
|
114 have "Derivs (c # s) (lang r) = Derivs s (Deriv c (lang r))" by simp |
|
115 also have "\<dots> = Derivs s (\<Union> (lang ` pderiv c r))" by (simp add: Deriv_pderiv) |
|
116 also have "\<dots> = Derivss s (lang ` (pderiv c r))" |
|
117 by (auto simp add: Derivs_def) |
|
118 also have "\<dots> = \<Union> (lang ` (pderivs_set s (pderiv c r)))" |
|
119 using ih by auto |
|
120 also have "\<dots> = \<Union> (lang ` (pderivs (c # s) r))" by simp |
|
121 finally show "Derivs (c # s) (lang r) = \<Union> (lang ` pderivs (c # s) r)" . |
|
122 qed (simp add: Derivs_def) |
|
123 |
|
124 subsection {* Relating derivatives and partial derivatives *} |
|
125 |
|
126 lemma deriv_pderiv: |
|
127 shows "\<Union> (lang ` (pderiv c r)) = lang (deriv c r)" |
|
128 unfolding lang_deriv Deriv_pderiv by simp |
|
129 |
|
130 lemma derivs_pderivs: |
|
131 shows "\<Union> (lang ` (pderivs s r)) = lang (derivs s r)" |
|
132 unfolding lang_derivs Derivs_pderivs by simp |
|
133 |
|
134 |
|
135 subsection {* Finiteness property of partial derivatives *} |
|
136 |
|
137 definition |
|
138 pderivs_lang :: "'a lang \<Rightarrow> 'a rexp \<Rightarrow> 'a rexp set" |
|
139 where |
|
140 "pderivs_lang A r \<equiv> \<Union>x \<in> A. pderivs x r" |
|
141 |
|
142 lemma pderivs_lang_subsetI: |
|
143 assumes "\<And>s. s \<in> A \<Longrightarrow> pderivs s r \<subseteq> C" |
|
144 shows "pderivs_lang A r \<subseteq> C" |
|
145 using assms unfolding pderivs_lang_def by (rule UN_least) |
|
146 |
|
147 lemma pderivs_lang_union: |
|
148 shows "pderivs_lang (A \<union> B) r = (pderivs_lang A r \<union> pderivs_lang B r)" |
|
149 by (simp add: pderivs_lang_def) |
|
150 |
|
151 lemma pderivs_lang_subset: |
|
152 shows "A \<subseteq> B \<Longrightarrow> pderivs_lang A r \<subseteq> pderivs_lang B r" |
|
153 by (auto simp add: pderivs_lang_def) |
|
154 |
|
155 definition |
|
156 "UNIV1 \<equiv> UNIV - {[]}" |
|
157 |
|
158 lemma pderivs_lang_Zero [simp]: |
|
159 shows "pderivs_lang UNIV1 Zero = {}" |
|
160 unfolding UNIV1_def pderivs_lang_def by auto |
|
161 |
|
162 lemma pderivs_lang_One [simp]: |
|
163 shows "pderivs_lang UNIV1 One = {}" |
|
164 unfolding UNIV1_def pderivs_lang_def by (auto split: if_splits) |
|
165 |
|
166 lemma pderivs_lang_Atom [simp]: |
|
167 shows "pderivs_lang UNIV1 (Atom c) = {One}" |
|
168 unfolding UNIV1_def pderivs_lang_def |
|
169 apply(auto) |
|
170 apply(frule rev_subsetD) |
|
171 apply(rule pderivs_Atom) |
|
172 apply(simp) |
|
173 apply(case_tac xa) |
|
174 apply(auto split: if_splits) |
|
175 done |
|
176 |
|
177 lemma pderivs_lang_Plus [simp]: |
|
178 shows "pderivs_lang UNIV1 (Plus r1 r2) = pderivs_lang UNIV1 r1 \<union> pderivs_lang UNIV1 r2" |
|
179 unfolding UNIV1_def pderivs_lang_def by auto |
|
180 |
|
181 |
|
182 text {* Non-empty suffixes of a string (needed for the cases of @{const Times} and @{const Star} below) *} |
|
183 |
|
184 definition |
|
185 "PSuf s \<equiv> {v. v \<noteq> [] \<and> (\<exists>u. u @ v = s)}" |
|
186 |
|
187 lemma PSuf_snoc: |
|
188 shows "PSuf (s @ [c]) = (PSuf s) @@ {[c]} \<union> {[c]}" |
|
189 unfolding PSuf_def conc_def |
|
190 by (auto simp add: append_eq_append_conv2 append_eq_Cons_conv) |
|
191 |
|
192 lemma PSuf_Union: |
|
193 shows "(\<Union>v \<in> PSuf s @@ {[c]}. f v) = (\<Union>v \<in> PSuf s. f (v @ [c]))" |
|
194 by (auto simp add: conc_def) |
|
195 |
|
196 lemma pderivs_lang_snoc: |
|
197 shows "pderivs_lang (PSuf s @@ {[c]}) r = (pderiv_set c (pderivs_lang (PSuf s) r))" |
|
198 unfolding pderivs_lang_def |
|
199 by (simp add: PSuf_Union pderivs_snoc) |
|
200 |
|
201 lemma pderivs_Times: |
|
202 shows "pderivs s (Times r1 r2) \<subseteq> Timess (pderivs s r1) r2 \<union> (pderivs_lang (PSuf s) r2)" |
|
203 proof (induct s rule: rev_induct) |
|
204 case (snoc c s) |
|
205 have ih: "pderivs s (Times r1 r2) \<subseteq> Timess (pderivs s r1) r2 \<union> (pderivs_lang (PSuf s) r2)" |
|
206 by fact |
|
207 have "pderivs (s @ [c]) (Times r1 r2) = pderiv_set c (pderivs s (Times r1 r2))" |
|
208 by (simp add: pderivs_snoc) |
|
209 also have "\<dots> \<subseteq> pderiv_set c (Timess (pderivs s r1) r2 \<union> (pderivs_lang (PSuf s) r2))" |
|
210 using ih by fast |
|
211 also have "\<dots> = pderiv_set c (Timess (pderivs s r1) r2) \<union> pderiv_set c (pderivs_lang (PSuf s) r2)" |
|
212 by (simp) |
|
213 also have "\<dots> = pderiv_set c (Timess (pderivs s r1) r2) \<union> pderivs_lang (PSuf s @@ {[c]}) r2" |
|
214 by (simp add: pderivs_lang_snoc) |
|
215 also |
|
216 have "\<dots> \<subseteq> pderiv_set c (Timess (pderivs s r1) r2) \<union> pderiv c r2 \<union> pderivs_lang (PSuf s @@ {[c]}) r2" |
|
217 by auto |
|
218 also |
|
219 have "\<dots> \<subseteq> Timess (pderiv_set c (pderivs s r1)) r2 \<union> pderiv c r2 \<union> pderivs_lang (PSuf s @@ {[c]}) r2" |
|
220 by (auto simp add: if_splits) |
|
221 also have "\<dots> = Timess (pderivs (s @ [c]) r1) r2 \<union> pderiv c r2 \<union> pderivs_lang (PSuf s @@ {[c]}) r2" |
|
222 by (simp add: pderivs_snoc) |
|
223 also have "\<dots> \<subseteq> Timess (pderivs (s @ [c]) r1) r2 \<union> pderivs_lang (PSuf (s @ [c])) r2" |
|
224 unfolding pderivs_lang_def by (auto simp add: PSuf_snoc) |
|
225 finally show ?case . |
|
226 qed (simp) |
|
227 |
|
228 lemma pderivs_lang_Times_aux1: |
|
229 assumes a: "s \<in> UNIV1" |
|
230 shows "pderivs_lang (PSuf s) r \<subseteq> pderivs_lang UNIV1 r" |
|
231 using a unfolding UNIV1_def PSuf_def pderivs_lang_def by auto |
|
232 |
|
233 lemma pderivs_lang_Times_aux2: |
|
234 assumes a: "s \<in> UNIV1" |
|
235 shows "Timess (pderivs s r1) r2 \<subseteq> Timess (pderivs_lang UNIV1 r1) r2" |
|
236 using a unfolding pderivs_lang_def by auto |
|
237 |
|
238 lemma pderivs_lang_Times: |
|
239 shows "pderivs_lang UNIV1 (Times r1 r2) \<subseteq> Timess (pderivs_lang UNIV1 r1) r2 \<union> pderivs_lang UNIV1 r2" |
|
240 apply(rule pderivs_lang_subsetI) |
|
241 apply(rule subset_trans) |
|
242 apply(rule pderivs_Times) |
|
243 using pderivs_lang_Times_aux1 pderivs_lang_Times_aux2 |
|
244 apply(blast) |
|
245 done |
|
246 |
|
247 lemma pderivs_Star: |
|
248 assumes a: "s \<noteq> []" |
|
249 shows "pderivs s (Star r) \<subseteq> Timess (pderivs_lang (PSuf s) r) (Star r)" |
|
250 using a |
|
251 proof (induct s rule: rev_induct) |
|
252 case (snoc c s) |
|
253 have ih: "s \<noteq> [] \<Longrightarrow> pderivs s (Star r) \<subseteq> Timess (pderivs_lang (PSuf s) r) (Star r)" by fact |
|
254 { assume asm: "s \<noteq> []" |
|
255 have "pderivs (s @ [c]) (Star r) = pderiv_set c (pderivs s (Star r))" by (simp add: pderivs_snoc) |
|
256 also have "\<dots> \<subseteq> pderiv_set c (Timess (pderivs_lang (PSuf s) r) (Star r))" |
|
257 using ih[OF asm] by fast |
|
258 also have "\<dots> \<subseteq> Timess (pderiv_set c (pderivs_lang (PSuf s) r)) (Star r) \<union> pderiv c (Star r)" |
|
259 by (auto split: if_splits) |
|
260 also have "\<dots> \<subseteq> Timess (pderivs_lang (PSuf (s @ [c])) r) (Star r) \<union> (Timess (pderiv c r) (Star r))" |
|
261 by (simp only: PSuf_snoc pderivs_lang_snoc pderivs_lang_union) |
|
262 (auto simp add: pderivs_lang_def) |
|
263 also have "\<dots> = Timess (pderivs_lang (PSuf (s @ [c])) r) (Star r)" |
|
264 by (auto simp add: PSuf_snoc PSuf_Union pderivs_snoc pderivs_lang_def) |
|
265 finally have ?case . |
|
266 } |
|
267 moreover |
|
268 { assume asm: "s = []" |
|
269 then have ?case by (auto simp add: pderivs_lang_def pderivs_snoc PSuf_def) |
|
270 } |
|
271 ultimately show ?case by blast |
|
272 qed (simp) |
|
273 |
|
274 lemma pderivs_lang_Star: |
|
275 shows "pderivs_lang UNIV1 (Star r) \<subseteq> Timess (pderivs_lang UNIV1 r) (Star r)" |
|
276 apply(rule pderivs_lang_subsetI) |
|
277 apply(rule subset_trans) |
|
278 apply(rule pderivs_Star) |
|
279 apply(simp add: UNIV1_def) |
|
280 apply(simp add: UNIV1_def PSuf_def) |
|
281 apply(auto simp add: pderivs_lang_def) |
|
282 done |
|
283 |
|
284 lemma finite_Timess [simp]: |
|
285 assumes a: "finite A" |
|
286 shows "finite (Timess A r)" |
|
287 using a by auto |
|
288 |
|
289 lemma finite_pderivs_lang_UNIV1: |
|
290 shows "finite (pderivs_lang UNIV1 r)" |
|
291 apply(induct r) |
|
292 apply(simp_all add: |
|
293 finite_subset[OF pderivs_lang_Times] |
|
294 finite_subset[OF pderivs_lang_Star]) |
|
295 done |
|
296 |
|
297 lemma pderivs_lang_UNIV: |
|
298 shows "pderivs_lang UNIV r = pderivs [] r \<union> pderivs_lang UNIV1 r" |
|
299 unfolding UNIV1_def pderivs_lang_def |
|
300 by blast |
|
301 |
|
302 lemma finite_pderivs_lang_UNIV: |
|
303 shows "finite (pderivs_lang UNIV r)" |
|
304 unfolding pderivs_lang_UNIV |
|
305 by (simp add: finite_pderivs_lang_UNIV1) |
|
306 |
|
307 lemma finite_pderivs_lang: |
|
308 shows "finite (pderivs_lang A r)" |
|
309 by (metis finite_pderivs_lang_UNIV pderivs_lang_subset rev_finite_subset subset_UNIV) |
|
310 |
|
311 |
|
312 text{* The following relationship between the alphabetic width of regular expressions |
|
313 (called @{text awidth} below) and the number of partial derivatives was proved |
|
314 by Antimirov~\cite{Antimirov95} and formalized by Max Haslbeck. *} |
|
315 |
|
316 fun awidth :: "'a rexp \<Rightarrow> nat" where |
|
317 "awidth Zero = 0" | |
|
318 "awidth One = 0" | |
|
319 "awidth (Atom a) = 1" | |
|
320 "awidth (Plus r1 r2) = awidth r1 + awidth r2" | |
|
321 "awidth (Times r1 r2) = awidth r1 + awidth r2" | |
|
322 "awidth (Star r1) = awidth r1" |
|
323 |
|
324 lemma card_Timess_pderivs_lang_le: |
|
325 "card (Timess (pderivs_lang A r) s) \<le> card (pderivs_lang A r)" |
|
326 by (metis card_image_le finite_pderivs_lang image_eq_UN) |
|
327 |
|
328 lemma card_pderivs_lang_UNIV1_le_awidth: "card (pderivs_lang UNIV1 r) \<le> awidth r" |
|
329 proof (induction r) |
|
330 case (Plus r1 r2) |
|
331 have "card (pderivs_lang UNIV1 (Plus r1 r2)) = card (pderivs_lang UNIV1 r1 \<union> pderivs_lang UNIV1 r2)" by simp |
|
332 also have "\<dots> \<le> card (pderivs_lang UNIV1 r1) + card (pderivs_lang UNIV1 r2)" |
|
333 by(simp add: card_Un_le) |
|
334 also have "\<dots> \<le> awidth (Plus r1 r2)" using Plus.IH by simp |
|
335 finally show ?case . |
|
336 next |
|
337 case (Times r1 r2) |
|
338 have "card (pderivs_lang UNIV1 (Times r1 r2)) \<le> card (Timess (pderivs_lang UNIV1 r1) r2 \<union> pderivs_lang UNIV1 r2)" |
|
339 by (simp add: card_mono finite_pderivs_lang pderivs_lang_Times) |
|
340 also have "\<dots> \<le> card (Timess (pderivs_lang UNIV1 r1) r2) + card (pderivs_lang UNIV1 r2)" |
|
341 by (simp add: card_Un_le) |
|
342 also have "\<dots> \<le> card (pderivs_lang UNIV1 r1) + card (pderivs_lang UNIV1 r2)" |
|
343 by (simp add: card_Timess_pderivs_lang_le) |
|
344 also have "\<dots> \<le> awidth (Times r1 r2)" using Times.IH by simp |
|
345 finally show ?case . |
|
346 next |
|
347 case (Star r) |
|
348 have "card (pderivs_lang UNIV1 (Star r)) \<le> card (Timess (pderivs_lang UNIV1 r) (Star r))" |
|
349 by (simp add: card_mono finite_pderivs_lang pderivs_lang_Star) |
|
350 also have "\<dots> \<le> card (pderivs_lang UNIV1 r)" by (rule card_Timess_pderivs_lang_le) |
|
351 also have "\<dots> \<le> awidth (Star r)" by (simp add: Star.IH) |
|
352 finally show ?case . |
|
353 qed (auto) |
|
354 |
|
355 text{* Antimirov's Theorem 3.4: *} |
|
356 theorem card_pderivs_lang_UNIV_le_awidth: "card (pderivs_lang UNIV r) \<le> awidth r + 1" |
|
357 proof - |
|
358 have "card (insert r (pderivs_lang UNIV1 r)) \<le> Suc (card (pderivs_lang UNIV1 r))" |
|
359 by(auto simp: card_insert_if[OF finite_pderivs_lang_UNIV1]) |
|
360 also have "\<dots> \<le> Suc (awidth r)" by(simp add: card_pderivs_lang_UNIV1_le_awidth) |
|
361 finally show ?thesis by(simp add: pderivs_lang_UNIV) |
|
362 qed |
|
363 |
|
364 text{* Antimirov's Corollary 3.5: *} |
|
365 corollary card_pderivs_lang_le_awidth: "card (pderivs_lang A r) \<le> awidth r + 1" |
|
366 by(rule order_trans[OF |
|
367 card_mono[OF finite_pderivs_lang_UNIV pderivs_lang_subset[OF subset_UNIV]] |
|
368 card_pderivs_lang_UNIV_le_awidth]) |
|
369 |
|
370 end |