%% Created for CS TAN at 2022-11-20 17:26:32 +0000


%% Saved with string encoding Unicode (UTF-8)

@inproceedings{Doczkal2013,
author = {C.~Doczkal and J.O.~Kaiser and G.~Smolka},
title = {A Constructive Theory of Regular Languages in Coq},
year = {2013},
isbn = {9783319035444},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
pages = {82--97},
numpages = {16},
keywords = {finite automata, regular expressions, Myhill-Nerode, Ssreflect, Coq, regular languages}
}

@inproceedings{Lammich2012,
author = {P.~Lammich and T.~Tuerk},
year = {2012},
month = {08},
pages = {166--182},
title = {Applying Data Refinement for Monadic Programs to Hopcroft's Algorithm},
isbn = {978-3-642-32346-1},
doi = {10.1007/978-3-642-32347-8_12}
}

@article{Krauss2012,
author = {A.~Krauss and T.~Nipkow},
title = {Proof Pearl: Regular Expression Equivalence and Relation Algebra},
journal = {J. Automated Reasoning},
volume = {49},
pages = {95--106},
year = {2012},
note = {published online March 2011}}

@article{Kleene1956,
author = {S.C.~Kleene},
title = {Representation of Events in Nerve Nets and Finite Automata},
journal = {Automata studies},
volume = {34},
pages = {3--41},
year = {1956},
publisher = {Princeton, NJ}
}

@inproceedings{Sailesh2006,
author = {S.~Kumar and S.~Dharmapurikar and F.~Yu and P.~Crowley and J.~Turner},
title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
year = {2006},
isbn = {1595933085},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/1159913.1159952},
doi = {10.1145/1159913.1159952},
abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. However, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application. In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space requirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95%. Given the substantially reduced space requirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide cost-effective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
pages = {339--350},
numpages = {12},
keywords = {deep packet inspection, regular expressions, DFA},
location = {Pisa, Italy},
series = {SIGCOMM '06}
}

@article{Murugesan2014,
bdsk-url-1 = {https://doi.org/10.4204/EPTCS.151.7}}

@inproceedings{Berglund18,
abstract = {Whereas Perl-compatible regular expression matchers typically exhibit some variation of leftmost-greedy semantics, those conforming to the posix standard are prescribed leftmost-longest semantics. However, the posix standard leaves some room for interpretation, and Fowler and Kuklewicz have done experimental work to confirm differences between various posix matchers. The Boost library has an interesting take on the posix standard, where it maximises the leftmost match not with respect to subexpressions of the regular expression pattern, but rather, with respect to capturing groups. In our work, we provide the first formalisation of Boost semantics, and we analyse the complexity of regular expression matching when using Boost semantics.},
address = {Cham},
author = {M.~Berglund and W.~Bester and B.~{van der Merwe}},
booktitle = {Theoretical Aspects of Computing -- ICTAC 2018},
editor = {Bernd Fischer and Tarmo Uustalu},
isbn = {978-3-030-02508-3},
pages = {99--115},
publisher = {Springer International Publishing},
title = {Formalising Boost POSIX Regular Expression Matching},
year = {2018}}

@inproceedings{Chen12,
author = {H.~Chen and S.~Yu},
doi = {10.1007/978-3-642-27654-5_27},
month = {01},
pages = {343--356},
title = {Derivatives of Regular Expressions and an Application},
volume = {7160},
year = {2012},
bdsk-url-1 = {https://doi.org/10.1007/978-3-642-27654-5_27}}

@article{Takayuki2019,
author = {T.~Miyazaki and Y.~Minamide},
doi = {10.2197/ipsjjip.27.422},
journal = {Journal of Information Processing},
pages = {422--430},
title = {Derivatives of Regular Expressions with Lookahead},
volume = {27},
year = {2012},
bdsk-url-1 = {https://doi.org/10.1007/978-3-642-31653-1_8}}

@article{BERGLUND2022,
abstract = {Most modern regular expression matching libraries (one of the rare exceptions being Google's RE2) allow backreferences, operations which bind a substring to a variable, allowing it to be matched again verbatim. However, both real-world implementations and definitions in the literature use different syntactic restrictions and have differences in the semantics of the matching of backreferences. Our aim is to compare these various flavors by considering the classes of formal languages that each can describe, establishing, as a result, a hierarchy of language classes. Beyond the hierarchy itself, some complexity results are given, and as part of the effort on comparing language classes new pumping lemmas are established, old classes are extended to new ones, and several incidental results on the nature of these language classes are given.},
author = {M.~Berglund and B.~{van der Merwe}},
doi = {10.1016/j.tcs.2022.10.041},
issn = {0304-3975},
journal = {Theoretical Computer Science},
keywords = {Regular expressions, Backreferences},
title = {Re-examining regular expressions with backreferences},
bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0304397522006570},
bdsk-url-2 = {https://doi.org/10.1016/j.tcs.2022.10.041}}

@article{FREYDENBERGER20191,
abstract = {Most modern libraries for regular expression matching allow back-references (i.e., repetition operators) that substantially increase expressive power, but also lead to intractability. In order to find a better balance between expressiveness and tractability, we combine these with the notion of determinism for regular expressions used in XML DTDs and XML Schema. This includes the definition of a suitable automaton model, and a generalization of the Glushkov construction. We demonstrate that, compared to their non-deterministic superclass, these deterministic regular expressions with back-references have desirable algorithmic properties (i.e., efficiently solvable membership problem and some decidable problems in static analysis), while, at the same time, their expressive power exceeds that of deterministic regular expressions without back-references.},
author = {D.D.~Freydenberger and M.L.~Schmid},
doi = {10.1016/j.jcss.2019.04.001},
issn = {0022-0000},
journal = {Journal of Computer and System Sciences},
keywords = {Deterministic regular expression, Regex, Glushkov automaton},
pages = {1--39},
bdsk-url-2 = {https://doi.org/10.1016/j.jcss.2019.04.001}}

@inproceedings{Frey2013,
address = {Dagstuhl, Germany},
annote = {Keywords: extended regular expressions, regex, decidability, non-recursive tradeoffs},
author = {D.D.~Freydenberger},
booktitle = {28th International Symposium on Theoretical Aspects of Computer Science (STACS 2011)},
doi = {10.4230/LIPIcs.STACS.2011.507},
editor = {Thomas Schwentick and Christoph D{\"u}rr},
isbn = {978-3-939897-25-5},
issn = {1868-8969},
year = {2009},
bdsk-url-1 = {https://doi.org/10.1007/s00236-009-0090-y}}

@article{CAMPEANU2009Intersect,
abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255--300] and rigorously in 2003 by C{\^a}mpeanu, Salomaa and Yu [C. C{\^a}mpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007--1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines --- regex automata systems (RAS) --- which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.},
author = {C.~C{\^a}mpeanu and N.~Santean},
doi = {10.1016/j.tcs.2009.02.022},
issn = {0304-3975},
journal = {Theoretical Computer Science},
keywords = {Extended regular expression, Regex automata system, Regex},
note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu},
title = {Parsing with Derivatives: A Functional Pearl},
url = {https://doi.org/10.1145/2034773.2034801},
year = {2011},
bdsk-url-1 = {https://doi.org/10.1145/2034773.2034801}}

@article{10.1145/2034574.2034801,
abstract = {We present a functional approach to parsing unrestricted context-free grammars based on Brzozowski's derivative of regular expressions. If we consider context-free grammars as recursive regular expressions, Brzozowski's equational theory extends without modification to context-free grammars (and it generalizes to parser combinators). The supporting actors in this story are three concepts familiar to functional programmers - laziness, memoization and fixed points; these allow Brzozowski's original equations to be transliterated into purely functional code in about 30 lines spread over three functions. Yet, this almost impossibly brief implementation has a drawback: its performance is sour - in both theory and practice. The culprit? Each derivative can double the size of a grammar, and with it, the cost of the next derivative. Fortunately, much of the new structure inflicted by the derivative is either dead on arrival, or it dies after the very next derivative. To eliminate it, we once again exploit laziness and memoization to transliterate an equational theory that prunes such debris into working code. Thanks to this compaction, parsing times become reasonable in practice. We equip the functional programmer with two equational theories that, when combined, make for an abbreviated understanding and implementation of a system for parsing context-free languages.},
address = {New York, NY, USA},
author = {M.~Might and D.~Darais and D.~Spiewak},
doi = {10.1145/2034574.2034801},
issn = {0362-1340},
issue_date = {September 2011},
journal = {SIGPLAN Not.},
keywords = {formal languages, derivative, context-free grammar, regular expressions, parsing, parser combinator},
month = {sep},
number = {9},
numpages = {7},
pages = {189--195},
publisher = {Association for Computing Machinery},
title = {Parsing with Derivatives: A Functional Pearl},
url = {https://doi.org/10.1145/2034574.2034801},
volume = {46},
year = {2011},
bdsk-url-1 = {https://doi.org/10.1145/2034574.2034801}}

@inproceedings{Adams2016,
title = {On the complexity and performance of parsing with derivatives},
author = {M.D.~Adams and C.~Hollenbeck and M.~Might},
booktitle = {Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {224--236},
year = {2016}
}

@article{Huet1997,
author = {G.~Huet},
title = {The Zipper},
year = {1997},
issue_date = {September 1997},
publisher = {Cambridge University Press},
address = {USA},
volume = {7},
number = {5},
issn = {0956-7968},
url = {https://doi.org/10.1017/S0956796897002864},
doi = {10.1017/S0956796897002864},
abstract = {Almost every programmer has faced the problem of representing a tree together with a subtree that is the focus of attention, where that focus may move left, right, up or down the tree. The Zipper is Huet's nifty name for a nifty data structure which fulfills this need. I wish I had known of it when I faced this task, because the solution I came up with was not quite so efficient or elegant as the Zipper.},
journal = {J. Funct. Program.},
month = {sep},
pages = {549--554},
numpages = {6}
}

@article{Darragh2020,
abstract = {Parsing with Derivatives (PwD) is an elegant approach to parsing context-free grammars (CFGs). It takes the equational theory behind Brzozowski's derivative for regular expressions and augments that theory with laziness, memoization, and fixed points. The result is a simple parser for arbitrary CFGs. Although recent work improved the performance of PwD, it remains inefficient due to the algorithm repeatedly traversing some parts of the grammar. In this functional pearl, we show how to avoid this inefficiency by suspending the state of the traversal in a zipper. When subsequent derivatives are taken, we can resume the traversal from where we left off without retraversing already traversed parts of the grammar. However, the original zipper is designed for use with trees, and we want to parse CFGs. CFGs can include shared regions, cycles, and choices between alternates, which makes them incompatible with the traditional tree model for zippers. This paper develops a generalization of zippers to properly handle these additional features. Just as PwD generalized Brzozowski's derivatives from regular expressions to CFGs, we generalize Huet's zippers from trees to CFGs. The resulting parsing algorithm is concise and efficient: it takes only 31 lines of OCaml code to implement the derivative function but performs 6,500 times faster than the original PwD and 3.24 times faster than the optimized implementation of PwD.},
address = {New York, NY, USA},
articleno = {108},
author = {P.~Darragh and M.D.~Adams},
doi = {10.1145/3408990},
issue_date = {August 2020},
journal = {Proc. ACM Program. Lang.},
keywords = {Derivatives, Zippers, Parsing with Derivatives, Parsing},
month = {aug},
Within the framework of the thesis, I present parsing algorithms that are based on the concept of Brzozowski's derivatives. Derivative-based parsing algorithms present several advantages: they are elegant, amenable to formal reasoning, and easy to implement. Unfortunately, the performance of these algorithms in practice is often not competitive with other approaches. In this thesis, I show a general technique inspired by Huet's Zipper to greatly enhance the performance of derivative-based algorithms, and I do so without compromising their elegance, amenability to formal reasoning, or ease of implementation.
First, I present a technique for building efficient tokenisers that is based on Brzozowski's derivatives and Huet's zipper and that does not require the usual burdensome explicit conversion to automata. I prove the technique is correct in Coq and present SILEX, a Scala lexing library based on the technique. I demonstrate that the approach is competitive with state-of-the-art solutions.
Then, I present a characterisation of LL(1) languages based on the concept of should-not-follow sets. I present an algorithm for parsing LL(1) languages with derivatives and zippers. I show a formal proof of the algorithm's correctness and prove its worst-case linear-time complexity. I show how the LL(1) parsing with derivatives and zippers algorithm corresponds to the traditional LL(1) parsing algorithm.
I then present SCALL1ON, a Scala parsing combinators library for LL(1) languages that incorporates the LL(1) parsing with derivatives and zippers algorithm. I present an expressive and familiar combinator-based interface for describing LL(1) languages. I present techniques that help precisely locate LL(1) conflicts in user code. I discuss several advantages of the parsing with derivatives approach within the context of a parsing library. I also present SCALL1ON's enumeration and pretty-printing features and discuss their implementation. Through a series of benchmarks, I demonstrate the good performance and practicality of the approach. Finally, I present how to adapt the LL(1) parsing with derivatives and zippers algorithm to support arbitrary context-free languages. I show how the adapted algorithm corresponds to general parsing algorithms, such as Earley's parsing algorithm.},
address = {Lausanne},
author = {R.~Edelmann},
doi = {10.5075/epfl-thesis-7357},
institution = {IINFCOM},
pages = {246},
publisher = {EPFL},
title = {Efficient Parsing with Derivatives and Zippers},
url = {http://infoscience.epfl.ch/record/287059},
year = {2021},
bdsk-url-1 = {http://infoscience.epfl.ch/record/287059},
bdsk-url-2 = {https://doi.org/10.5075/epfl-thesis-7357}}

@inproceedings{Paulson2015,
author = {L.~Paulson},
year = {2015},
month = {05},
title = {A Formalisation of Finite Automata Using Hereditarily Finite Sets},
isbn = {978-3-319-21400-9},
doi = {10.1007/978-3-319-21401-6_15}
}

@inproceedings{Zippy2020,
abstract = {In this paper, we present an efficient, functional, and formally verified parsing algorithm for LL(1) context-free expressions based on the concept of derivatives of formal languages. Parsing with derivatives is an elegant parsing technique, which, in the general case, suffers from cubic worst-case time complexity and slow performance in practice. We specialise the parsing with derivatives algorithm to LL(1) context-free expressions, where alternatives can be chosen given a single token of lookahead. We formalise the notion of LL(1) expressions and show how to efficiently check the LL(1) property. Next, we present a novel linear-time parsing with derivatives algorithm for LL(1) expressions operating on a zipper-inspired data structure. We prove the algorithm correct in Coq and present an implementation as a part of Scallion, a parser combinators framework in Scala with enumeration and pretty printing capabilities.},
address = {New York, NY, USA},
author = {R.~Edelmann and J.~Hamza and V.~Kun\v{c}ak},
booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
doi = {10.1145/3385412.3385992},
isbn = {9781450376136},
keywords = {Parsing, Zipper, Formal proof, LL(1), Derivatives},
location = {London, UK},
series = {LISA '99},
title = {Snort - Lightweight Intrusion Detection for Networks},
year = {1999}}

@article{Reps1998,
author = {T.~Reps},
title = {“Maximal-Munch” Tokenization in Linear Time},
year = {1998},
issue_date = {March 1998},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {20},
number = {2},
issn = {0164-0925},
url = {https://doi.org/10.1145/276393.276394},
doi = {10.1145/276393.276394},
abstract = {The lexical-analysis (or scanning) phase of a compiler attempts to partition an input string into a sequence of tokens. The convention in most languages is that the input is scanned left to right, and each token identified is a “maximal munch” of the remaining input—the longest prefix of the remaining input that is a token of the language. Although most of the standard compiler textbooks present a way to perform maximal-munch tokenization, the algorithm they describe is one that, for certain sets of token definitions, can cause the scanner to exhibit quadratic behavior in the worst case. In the article, we show that maximal-munch tokenization can always be performed in time linear in the size of the input.},
journal = {ACM Trans. Program. Lang. Syst.},
month = {mar},
pages = {259--273},
numpages = {15},
keywords = {backtracking, tokenization, tabulation, dynamic programming, memoization}
}

@inproceedings{Bro,
author = {V.~Paxson},
title = {Bro: A System for Detecting Network Intruders in Real-Time},
year = {1998},
numpages = {1},
location = {San Antonio, Texas},
series = {SSYM'98}
}

@inproceedings{Sidhu2001,
author = {R.~Sidhu and V.K.~Prasanna},
booktitle = {The 9th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM'01)},
title = {Fast Regular Expression Matching Using FPGAs},
year = {2001},
pages = {227--238}
}

@article{Turonova2020,
author = {L.~Turo\v{n}ov\'{a} and L.~Hol\'{\i}k and O.~Leng\'{a}l and O.~Saarikivi and M.~Veanes and T.~Vojnar},
title = {Regex Matching with Counting-Set Automata},
year = {2020},
issue_date = {November 2020},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {4},
number = {OOPSLA},
url = {https://doi.org/10.1145/3428286},
doi = {10.1145/3428286},
abstract = {We propose a solution to the problem of efficient matching regular expressions (regexes) with bounded repetition, such as (ab){1,100}, using deterministic automata. For this, we introduce novel counting-set automata (CsAs), automata with registers that can hold sets of bounded integers and can be manipulated by a limited portfolio of constant-time operations. We present an algorithm that compiles a large sub-class of regexes to deterministic CsAs. This includes (1) a novel Antimirov-style translation of regexes with counting to counting automata (CAs), nondeterministic automata with bounded counters, and (2) our main technical contribution, a determinization of CAs that outputs CsAs. The main advantage of this workflow is that the size of the produced CsAs does not depend on the repetition bounds used in the regex (while the size of the DFA is exponential to them). Our experimental results confirm that deterministic CsAs produced from practical regexes with repetition are indeed vastly smaller than the corresponding DFAs. More importantly, our prototype matcher based on CsA simulation handles practical regexes with repetition regardless of sizes of counter bounds. It easily copes with regexes with repetition where state-of-the-art matchers struggle.},
journal = {Proc. ACM Program. Lang.},
month = {nov},
articleno = {218},
numpages = {30},
keywords = {counting-set automata, counting automata, bounded repetition, ReDos, Antimirov's derivatives, regular expression matching, determinization}
}

@inproceedings{Kumar2006,
author = {S.~Kumar and S.~Dharmapurikar and F.~Yu and P.~Crowley and J.~Turner},
title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
year = {2006},
isbn = {1595933085},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/1159913.1159952},
doi = {10.1145/1159913.1159952},
abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. However, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application. In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space requirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95%. Given the substantially reduced space requirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide cost-effective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
pages = {339--350},
numpages = {12},
keywords = {regular expressions, DFA, deep packet inspection},
location = {Pisa, Italy},
series = {SIGCOMM '06}
}

@inproceedings{Becchi2007,
author = {M.~Becchi and P.~Crowley},
title = {An Improved Algorithm to Accelerate Regular Expression Evaluation},
year = {2007},
isbn = {9781595939456},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/1323548.1323573},
doi = {10.1145/1323548.1323573},
abstract = {Modern network intrusion detection systems need to perform regular expression matching at line rate in order to detect the occurrence of critical patterns in packet payloads. While deterministic finite automata (DFAs) allow this operation to be performed in linear time, they may exhibit prohibitive memory requirements. In [9], Kumar et al. propose Delayed Input DFAs (D2FAs), which provide a trade-off between the memory requirements of the compressed DFA and the number of states visited for each character processed, which corresponds directly to the memory bandwidth required to evaluate regular expressions. In this paper we introduce a general compression technique that results in at most 2N state traversals when processing a string of length N. In comparison to the D2FA approach, our technique achieves comparable levels of compression, with lower provable bounds on memory bandwidth (or greater compression for a given bandwidth bound). Moreover, our proposed algorithm has lower complexity, is suitable for scenarios where a compressed DFA needs to be dynamically built or updated, and fosters locality in the traversal process. Finally, we also describe a novel alphabet reduction scheme for DFA-based structures that can yield further dramatic reductions in data structure size.},
booktitle = {Proceedings of the 3rd ACM/IEEE Symposium on Architecture for Networking and Communications Systems},
pages = {145--154},
numpages = {10},
keywords = {regular expressions, DFA, deep packet inspection},
location = {Orlando, Florida, USA},
series = {ANCS '07}
}

@inproceedings{Becchi08,
author = {M.~Becchi and P.~Crowley},
doi = {10.1145/1544012.1544037},
month = {01},
pages = {25},
title = {Extending finite automata to efficiently match Perl-compatible regular expressions},
year = {2008},
year = {2021},
bdsk-url-1 = {https://doi.org/10.1109/SPW53761.2021.00022}}

@inproceedings{Nipkow1998,
author = {T.~Nipkow},
editor = {Jim Grundy and Malcolm C. Newey},
title = {Verified Lexical Analysis},
booktitle = {Theorem Proving in Higher Order Logics, 11th International Conference, TPHOLs'98, Canberra, Australia, September 27 - October 1, 1998, Proceedings},