ChengsongTanPhdThesis/example.bib
changeset 627 94db2636a296
parent 626 1c8525061545
child 628 7af4e2420a8c
equal deleted inserted replaced
626:1c8525061545 627:94db2636a296
     4 %% Created for CS TAN at 2022-11-20 17:26:32 +0000 
     4 %% Created for CS TAN at 2022-11-20 17:26:32 +0000 
     5 
     5 
     6 
     6 
     7 %% Saved with string encoding Unicode (UTF-8) 
     7 %% Saved with string encoding Unicode (UTF-8) 
     8 @inproceedings{Doczkal2013,
     8 @inproceedings{Doczkal2013,
     9 author = {Doczkal, Christian and Kaiser, Jan-Oliver and Smolka, Gert},
     9 author = {C.~Doczkal and J.O.~Kaiser and G.~Smolka},
    10 title = {A Constructive Theory of Regular Languages in Coq},
    10 title = {A Constructive Theory of Regular Languages in Coq},
    11 year = {2013},
    11 year = {2013},
    12 isbn = {9783319035444},
    12 isbn = {9783319035444},
    13 publisher = {Springer-Verlag},
    13 publisher = {Springer-Verlag},
    14 address = {Berlin, Heidelberg},
    14 address = {Berlin, Heidelberg},
    19 pages = {82–97},
    19 pages = {82–97},
    20 numpages = {16},
    20 numpages = {16},
    21 keywords = {finite automata, regular expressions, Myhill-Nerode, Ssreflect, Coq, regular languages}
    21 keywords = {finite automata, regular expressions, Myhill-Nerode, Ssreflect, Coq, regular languages}
    22 }
    22 }
    23 
    23 
    24 
    24 @inproceedings{Lammich2012,
       
    25 author = {P.~Lammich and T.~Tuerk},
       
    26 year = {2012},
       
    27 month = {08},
       
    28 pages = {166-182},
       
    29 title = {Applying Data Refinement for Monadic Programs to Hopcroft's Algorithm},
       
    30 isbn = {978-3-642-32346-1},
       
    31 doi = {10.1007/978-3-642-32347-8_12}
       
    32 }
    25 
    33 
    26 @article{Krauss2012,
    34 @article{Krauss2012,
    27 	author={Alexander Krauss and Tobias Nipkow},
    35 	author={A.~Krauss and T.~Nipkow},
    28 title={Proof Pearl: Regular Expression Equivalence and Relation Algebra},
    36 title={Proof Pearl: Regular Expression Equivalence and Relation Algebra},
    29 journal={J. Automated Reasoning},volume=49,pages={95--106},year=2012,note={published online March 2011}}
    37 journal={J. Automated Reasoning},volume=49,pages={95--106},year=2012,note={published online March 2011}}
    30 
    38 
    31 
    39 
    32 
    40 
    36   journal={Automata studies},
    44   journal={Automata studies},
    37   volume={34},
    45   volume={34},
    38   pages={3--41},
    46   pages={3--41},
    39   year={1956},
    47   year={1956},
    40   publisher={Princeton, NJ}
    48   publisher={Princeton, NJ}
    41 }
       
    42 
       
    43 @inproceedings{Sailesh2006,
       
    44 author = {S.~Kumar and S.~Dharmapurikar and F.~Yu and P.~Crowley and J.~Turner},
       
    45 title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
       
    46 year = {2006},
       
    47 isbn = {1595933085},
       
    48 publisher = {Association for Computing Machinery},
       
    49 address = {New York, NY, USA},
       
    50 url = {https://doi.org/10.1145/1159913.1159952},
       
    51 doi = {10.1145/1159913.1159952},
       
    52 abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. However, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application. In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space requirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95%. Given the substantially reduced space requirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide cost-effective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
       
    53 booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
       
    54 pages = {339–350},
       
    55 numpages = {12},
       
    56 keywords = {deep packet inspection, regular expressions, DFA},
       
    57 location = {Pisa, Italy},
       
    58 series = {SIGCOMM '06}
       
    59 }
    49 }
    60 
    50 
    61 
    51 
    62 
    52 
    63 @article{Murugesan2014,
    53 @article{Murugesan2014,
   100 	bdsk-url-1 = {https://doi.org/10.4204/EPTCS.151.7}}
    90 	bdsk-url-1 = {https://doi.org/10.4204/EPTCS.151.7}}
   101 
    91 
   102 @inproceedings{Berglund18,
    92 @inproceedings{Berglund18,
   103 	abstract = {Whereas Perl-compatible regular expression matchers typically exhibit some variation of leftmost-greedy semantics, those conforming to the posix standard are prescribed leftmost-longest semantics. However, the posix standard leaves some room for interpretation, and Fowler and Kuklewicz have done experimental work to confirm differences between various posix matchers. The Boost library has an interesting take on the posix standard, where it maximises the leftmost match not with respect to subexpressions of the regular expression pattern, but rather, with respect to capturing groups. In our work, we provide the first formalisation of Boost semantics, and we analyse the complexity of regular expression matching when using Boost semantics.},
    93 	abstract = {Whereas Perl-compatible regular expression matchers typically exhibit some variation of leftmost-greedy semantics, those conforming to the posix standard are prescribed leftmost-longest semantics. However, the posix standard leaves some room for interpretation, and Fowler and Kuklewicz have done experimental work to confirm differences between various posix matchers. The Boost library has an interesting take on the posix standard, where it maximises the leftmost match not with respect to subexpressions of the regular expression pattern, but rather, with respect to capturing groups. In our work, we provide the first formalisation of Boost semantics, and we analyse the complexity of regular expression matching when using Boost semantics.},
   104 	address = {Cham},
    94 	address = {Cham},
   105 	author = {M.~Berglund and Bester, Willem and van der Merwe, Brink},
    95 	author = {M.~Berglund and W.~Bester and B.~{van der Merwe}},
   106 	booktitle = {Theoretical Aspects of Computing -- ICTAC 2018},
    96 	booktitle = {Theoretical Aspects of Computing -- ICTAC 2018},
   107 	editor = {Fischer, Bernd and Uustalu, Tarmo},
    97 	editor = {Fischer, Bernd and Uustalu, Tarmo},
   108 	isbn = {978-3-030-02508-3},
    98 	isbn = {978-3-030-02508-3},
   109 	pages = {99--115},
    99 	pages = {99--115},
   110 	publisher = {Springer International Publishing},
   100 	publisher = {Springer International Publishing},
   111 	title = {Formalising Boost POSIX Regular Expression Matching},
   101 	title = {Formalising Boost POSIX Regular Expression Matching},
   112 	year = {2018}}
   102 	year = {2018}}
   113 
   103 
   114 @inproceedings{Chen12,
   104 @inproceedings{Chen12,
   115 	author = {Chen, Haiming and Yu, Sheng},
   105 	author = {H.~Chen and S.~Yu},
   116 	doi = {10.1007/978-3-642-27654-5_27},
   106 	doi = {10.1007/978-3-642-27654-5_27},
   117 	month = {01},
   107 	month = {01},
   118 	pages = {343-356},
   108 	pages = {343-356},
   119 	title = {Derivatives of Regular Expressions and an Application},
   109 	title = {Derivatives of Regular Expressions and an Application},
   120 	volume = {7160},
   110 	volume = {7160},
   121 	year = {2012},
   111 	year = {2012},
   122 	bdsk-url-1 = {https://doi.org/10.1007/978-3-642-27654-5_27}}
   112 	bdsk-url-1 = {https://doi.org/10.1007/978-3-642-27654-5_27}}
   123 
   113 
   124 @article{Takayuki2019,
   114 @article{Takayuki2019,
   125 	author = {Takayuki Miyazaki and Yasuhiko Minamide},
   115 	author = {T.~Miyazaki and Y.~Minamide},
   126 	doi = {10.2197/ipsjjip.27.422},
   116 	doi = {10.2197/ipsjjip.27.422},
   127 	journal = {Journal of Information Processing},
   117 	journal = {Journal of Information Processing},
   128 	pages = {422-430},
   118 	pages = {422-430},
   129 	title = {Derivatives of Regular Expressions with Lookahead},
   119 	title = {Derivatives of Regular Expressions with Lookahead},
   130 	volume = {27},
   120 	volume = {27},
   164 	year = {2012},
   154 	year = {2012},
   165 	bdsk-url-1 = {https://doi.org/10.1007/978-3-642-31653-1_8}}
   155 	bdsk-url-1 = {https://doi.org/10.1007/978-3-642-31653-1_8}}
   166 
   156 
   167 @article{BERGLUND2022,
   157 @article{BERGLUND2022,
   168 	abstract = {Most modern regular expression matching libraries (one of the rare exceptions being Google's RE2) allow backreferences, operations which bind a substring to a variable, allowing it to be matched again verbatim. However, both real-world implementations and definitions in the literature use different syntactic restrictions and have differences in the semantics of the matching of backreferences. Our aim is to compare these various flavors by considering the classes of formal languages that each can describe, establishing, as a result, a hierarchy of language classes. Beyond the hierarchy itself, some complexity results are given, and as part of the effort on comparing language classes new pumping lemmas are established, old classes are extended to new ones, and several incidental results on the nature of these language classes are given.},
   158 	abstract = {Most modern regular expression matching libraries (one of the rare exceptions being Google's RE2) allow backreferences, operations which bind a substring to a variable, allowing it to be matched again verbatim. However, both real-world implementations and definitions in the literature use different syntactic restrictions and have differences in the semantics of the matching of backreferences. Our aim is to compare these various flavors by considering the classes of formal languages that each can describe, establishing, as a result, a hierarchy of language classes. Beyond the hierarchy itself, some complexity results are given, and as part of the effort on comparing language classes new pumping lemmas are established, old classes are extended to new ones, and several incidental results on the nature of these language classes are given.},
   169 	author = {Martin Berglund and Brink {van der Merwe}},
   159 	author = {M.~Berglund and B.~{van der Merwe}},
   170 	doi = {https://doi.org/10.1016/j.tcs.2022.10.041},
   160 	doi = {https://doi.org/10.1016/j.tcs.2022.10.041},
   171 	issn = {0304-3975},
   161 	issn = {0304-3975},
   172 	journal = {Theoretical Computer Science},
   162 	journal = {Theoretical Computer Science},
   173 	keywords = {Regular expressions, Backreferences},
   163 	keywords = {Regular expressions, Backreferences},
   174 	title = {Re-examining regular expressions with backreferences},
   164 	title = {Re-examining regular expressions with backreferences},
   177 	bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0304397522006570},
   167 	bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0304397522006570},
   178 	bdsk-url-2 = {https://doi.org/10.1016/j.tcs.2022.10.041}}
   168 	bdsk-url-2 = {https://doi.org/10.1016/j.tcs.2022.10.041}}
   179 
   169 
   180 @article{FREYDENBERGER20191,
   170 @article{FREYDENBERGER20191,
   181 	abstract = {Most modern libraries for regular expression matching allow back-references (i.e., repetition operators) that substantially increase expressive power, but also lead to intractability. In order to find a better balance between expressiveness and tractability, we combine these with the notion of determinism for regular expressions used in XML DTDs and XML Schema. This includes the definition of a suitable automaton model, and a generalization of the Glushkov construction. We demonstrate that, compared to their non-deterministic superclass, these deterministic regular expressions with back-references have desirable algorithmic properties (i.e., efficiently solvable membership problem and some decidable problems in static analysis), while, at the same time, their expressive power exceeds that of deterministic regular expressions without back-references.},
   171 	abstract = {Most modern libraries for regular expression matching allow back-references (i.e., repetition operators) that substantially increase expressive power, but also lead to intractability. In order to find a better balance between expressiveness and tractability, we combine these with the notion of determinism for regular expressions used in XML DTDs and XML Schema. This includes the definition of a suitable automaton model, and a generalization of the Glushkov construction. We demonstrate that, compared to their non-deterministic superclass, these deterministic regular expressions with back-references have desirable algorithmic properties (i.e., efficiently solvable membership problem and some decidable problems in static analysis), while, at the same time, their expressive power exceeds that of deterministic regular expressions without back-references.},
   182 	author = {Dominik D. Freydenberger and Markus L. Schmid},
   172 	author = {D.D.~Freydenberger and M.L.~Schmid},
   183 	doi = {https://doi.org/10.1016/j.jcss.2019.04.001},
   173 	doi = {https://doi.org/10.1016/j.jcss.2019.04.001},
   184 	issn = {0022-0000},
   174 	issn = {0022-0000},
   185 	journal = {Journal of Computer and System Sciences},
   175 	journal = {Journal of Computer and System Sciences},
   186 	keywords = {Deterministic regular expression, Regex, Glushkov automaton},
   176 	keywords = {Deterministic regular expression, Regex, Glushkov automaton},
   187 	pages = {1-39},
   177 	pages = {1-39},
   193 	bdsk-url-2 = {https://doi.org/10.1016/j.jcss.2019.04.001}}
   183 	bdsk-url-2 = {https://doi.org/10.1016/j.jcss.2019.04.001}}
   194 
   184 
   195 @inproceedings{Frey2013,
   185 @inproceedings{Frey2013,
   196 	address = {Dagstuhl, Germany},
   186 	address = {Dagstuhl, Germany},
   197 	annote = {Keywords: extended regular expressions, regex, decidability, non-recursive tradeoffs},
   187 	annote = {Keywords: extended regular expressions, regex, decidability, non-recursive tradeoffs},
   198 	author = {Dominik D. Freydenberger},
   188 	author = {D.D.~Freydenberger},
   199 	booktitle = {28th International Symposium on Theoretical Aspects of Computer Science (STACS 2011)},
   189 	booktitle = {28th International Symposium on Theoretical Aspects of Computer Science (STACS 2011)},
   200 	doi = {10.4230/LIPIcs.STACS.2011.507},
   190 	doi = {10.4230/LIPIcs.STACS.2011.507},
   201 	editor = {Thomas Schwentick and Christoph D{\"u}rr},
   191 	editor = {Thomas Schwentick and Christoph D{\"u}rr},
   202 	isbn = {978-3-939897-25-5},
   192 	isbn = {978-3-939897-25-5},
   203 	issn = {1868-8969},
   193 	issn = {1868-8969},
   233 	year = {2009},
   223 	year = {2009},
   234 	bdsk-url-1 = {https://doi.org/10.1007/s00236-009-0090-y}}
   224 	bdsk-url-1 = {https://doi.org/10.1007/s00236-009-0090-y}}
   235 
   225 
   236 @article{CAMPEANU2009Intersect,
   226 @article{CAMPEANU2009Intersect,
   237 	abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255--300] and rigorously in 2003 by C{\^a}mpeanu, Salomaa and Yu [C. C{\^a}mpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007--1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines --- regex automata systems (RAS) --- which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.},
   227 	abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255--300] and rigorously in 2003 by C{\^a}mpeanu, Salomaa and Yu [C. C{\^a}mpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007--1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines --- regex automata systems (RAS) --- which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.},
   238 	author = {Cezar C{\^a}mpeanu and Nicolae Santean},
   228 	author = {C.~C{\^a}mpeanu and N.~Santean},
   239 	doi = {https://doi.org/10.1016/j.tcs.2009.02.022},
   229 	doi = {https://doi.org/10.1016/j.tcs.2009.02.022},
   240 	issn = {0304-3975},
   230 	issn = {0304-3975},
   241 	journal = {Theoretical Computer Science},
   231 	journal = {Theoretical Computer Science},
   242 	keywords = {Extended regular expression, Regex automata system, Regex},
   232 	keywords = {Extended regular expression, Regex automata system, Regex},
   243 	note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu},
   233 	note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu},
   284 	title = {Parsing with Derivatives: A Functional Pearl},
   274 	title = {Parsing with Derivatives: A Functional Pearl},
   285 	url = {https://doi.org/10.1145/2034773.2034801},
   275 	url = {https://doi.org/10.1145/2034773.2034801},
   286 	year = {2011},
   276 	year = {2011},
   287 	bdsk-url-1 = {https://doi.org/10.1145/2034773.2034801}}
   277 	bdsk-url-1 = {https://doi.org/10.1145/2034773.2034801}}
   288 
   278 
   289 @article{10.1145/2034574.2034801,
       
   290 	abstract = {We present a functional approach to parsing unrestricted context-free grammars based on Brzozowski's derivative of regular expressions. If we consider context-free grammars as recursive regular expressions, Brzozowski's equational theory extends without modification to context-free grammars (and it generalizes to parser combinators). The supporting actors in this story are three concepts familiar to functional programmers - laziness, memoization and fixed points; these allow Brzozowski's original equations to be transliterated into purely functional code in about 30 lines spread over three functions.Yet, this almost impossibly brief implementation has a drawback: its performance is sour - in both theory and practice. The culprit? Each derivative can double the size of a grammar, and with it, the cost of the next derivative.Fortunately, much of the new structure inflicted by the derivative is either dead on arrival, or it dies after the very next derivative. To eliminate it, we once again exploit laziness and memoization to transliterate an equational theory that prunes such debris into working code. Thanks to this compaction, parsing times become reasonable in practice.We equip the functional programmer with two equational theories that, when combined, make for an abbreviated understanding and implementation of a system for parsing context-free languages.},
       
   291 	address = {New York, NY, USA},
       
   292 	author = {M.~Might and D.~Darais and D.~Spiewak},
       
   293 	doi = {10.1145/2034574.2034801},
       
   294 	issn = {0362-1340},
       
   295 	issue_date = {September 2011},
       
   296 	journal = {SIGPLAN Not.},
       
   297 	keywords = {formal languages, derivative, context-free grammar, regular expressions, parsing, parser combinator},
       
   298 	month = {sep},
       
   299 	number = {9},
       
   300 	numpages = {7},
       
   301 	pages = {189--195},
       
   302 	publisher = {Association for Computing Machinery},
       
   303 	title = {Parsing with Derivatives: A Functional Pearl},
       
   304 	url = {https://doi.org/10.1145/2034574.2034801},
       
   305 	volume = {46},
       
   306 	year = {2011},
       
   307 	bdsk-url-1 = {https://doi.org/10.1145/2034574.2034801}}
       
   308 
       
   309 
   279 
   310 
   280 
   311 @inproceedings{Adams2016,
   281 @inproceedings{Adams2016,
   312   title={On the complexity and performance of parsing with derivatives},
   282   title={On the complexity and performance of parsing with derivatives},
   313   author={Adams, Michael D and Hollenbeck, Celeste and Might, Matthew},
   283   author={M.D.~Adams and C.~Hollenbeck and M.~Might},
   314   booktitle={Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
   284   booktitle={Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
   315   pages={224--236},
   285   pages={224--236},
   316   year={2016}
   286   year={2016}
   317 }
   287 }
   318 
   288 @article{Huet1997,
       
   289 author = {G.~Huet},
       
   290 title = {The Zipper},
       
   291 year = {1997},
       
   292 issue_date = {September 1997},
       
   293 publisher = {Cambridge University Press},
       
   294 address = {USA},
       
   295 volume = {7},
       
   296 number = {5},
       
   297 issn = {0956-7968},
       
   298 url = {https://doi.org/10.1017/S0956796897002864},
       
   299 doi = {10.1017/S0956796897002864},
       
   300 abstract = {Almost every programmer has faced the problem of representing a tree together with a subtree that is the focus of attention, where that focus may move left, right, up or down the tree. The Zipper is Huet's nifty name for a nifty data structure which fulfills this need. I wish I had known of it when I faced this task, because the solution I came up with was not quite so efficient or elegant as the Zipper.},
       
   301 journal = {J. Funct. Program.},
       
   302 month = {sep},
       
   303 pages = {549–554},
       
   304 numpages = {6}
       
   305 }
   319 
   306 
   320 @article{Darragh2020,
   307 @article{Darragh2020,
   321 	abstract = {Parsing with Derivatives (PwD) is an elegant approach to parsing context-free grammars (CFGs). It takes the equational theory behind Brzozowski's derivative for regular expressions and augments that theory with laziness, memoization, and fixed points. The result is a simple parser for arbitrary CFGs. Although recent work improved the performance of PwD, it remains inefficient due to the algorithm repeatedly traversing some parts of the grammar. In this functional pearl, we show how to avoid this inefficiency by suspending the state of the traversal in a zipper. When subsequent derivatives are taken, we can resume the traversal from where we left off without retraversing already traversed parts of the grammar. However, the original zipper is designed for use with trees, and we want to parse CFGs. CFGs can include shared regions, cycles, and choices between alternates, which makes them incompatible with the traditional tree model for zippers. This paper develops a generalization of zippers to properly handle these additional features. Just as PwD generalized Brzozowski's derivatives from regular expressions to CFGs, we generalize Huet's zippers from trees to CFGs. Abstract The resulting parsing algorithm is concise and efficient: it takes only 31 lines of OCaml code to implement the derivative function but performs 6,500 times faster than the original PwD and 3.24 times faster than the optimized implementation of PwD.},
   308 	abstract = {Parsing with Derivatives (PwD) is an elegant approach to parsing context-free grammars (CFGs). It takes the equational theory behind Brzozowski's derivative for regular expressions and augments that theory with laziness, memoization, and fixed points. The result is a simple parser for arbitrary CFGs. Although recent work improved the performance of PwD, it remains inefficient due to the algorithm repeatedly traversing some parts of the grammar. In this functional pearl, we show how to avoid this inefficiency by suspending the state of the traversal in a zipper. When subsequent derivatives are taken, we can resume the traversal from where we left off without retraversing already traversed parts of the grammar. However, the original zipper is designed for use with trees, and we want to parse CFGs. CFGs can include shared regions, cycles, and choices between alternates, which makes them incompatible with the traditional tree model for zippers. This paper develops a generalization of zippers to properly handle these additional features. Just as PwD generalized Brzozowski's derivatives from regular expressions to CFGs, we generalize Huet's zippers from trees to CFGs. Abstract The resulting parsing algorithm is concise and efficient: it takes only 31 lines of OCaml code to implement the derivative function but performs 6,500 times faster than the original PwD and 3.24 times faster than the optimized implementation of PwD.},
   322 	address = {New York, NY, USA},
   309 	address = {New York, NY, USA},
   323 	articleno = {108},
   310 	articleno = {108},
   324 	author = {Darragh, Pierce and Adams, Michael D.},
   311 	author = {P.~Darragh and M.D.~Adams},
   325 	doi = {10.1145/3408990},
   312 	doi = {10.1145/3408990},
   326 	issue_date = {August 2020},
   313 	issue_date = {August 2020},
   327 	journal = {Proc. ACM Program. Lang.},
   314 	journal = {Proc. ACM Program. Lang.},
   328 	keywords = {Derivatives, Zippers, Parsing with Derivatives, Parsing},
   315 	keywords = {Derivatives, Zippers, Parsing with Derivatives, Parsing},
   329 	month = {aug},
   316 	month = {aug},
   341 Within the framework of the thesis, I present  parsing algorithms that are based on the concept of  Brzozowski's derivatives. Derivative-based parsing  algorithms present several advantages: they are elegant,  amenable to formal reasoning, and easy to implement.  Unfortunately, the performance of these algorithms in  practice is often not competitive with other approaches. In  this thesis, I show a general technique inspired by Huet's  Zipper to greatly enhance the performance of  derivative-based algorithms, and I do so without  compromising their elegance, amenability to formal  reasoning, or ease of implementation.
   328 Within the framework of the thesis, I present  parsing algorithms that are based on the concept of  Brzozowski's derivatives. Derivative-based parsing  algorithms present several advantages: they are elegant,  amenable to formal reasoning, and easy to implement.  Unfortunately, the performance of these algorithms in  practice is often not competitive with other approaches. In  this thesis, I show a general technique inspired by Huet's  Zipper to greatly enhance the performance of  derivative-based algorithms, and I do so without  compromising their elegance, amenability to formal  reasoning, or ease of implementation.
   342 First, I present a  technique for building efficient tokenisers that is based  on Brzozowski's derivatives and Huet's zipper and that does  not require the usual burdensome explicit conversion to  automata. I prove the technique is correct in Coq and  present SILEX, a Scala lexing library based on the  technique. I demonstrate that the approach is competitive  with state-of-the-art solutions.
   329 First, I present a  technique for building efficient tokenisers that is based  on Brzozowski's derivatives and Huet's zipper and that does  not require the usual burdensome explicit conversion to  automata. I prove the technique is correct in Coq and  present SILEX, a Scala lexing library based on the  technique. I demonstrate that the approach is competitive  with state-of-the-art solutions.
   343 Then, I present a  characterisation of LL(1) languages based on the concept of  should-not-follow sets. I present an algorithm for parsing  LL(1) languages with derivatives and zippers. I show a  formal proof of the algorithm's correctness and prove its  worst-case linear-time complexity. I show how the LL(1)  parsing with derivatives and zippers algorithm corresponds  to the traditional LL(1) parsing algorithm.
   330 Then, I present a  characterisation of LL(1) languages based on the concept of  should-not-follow sets. I present an algorithm for parsing  LL(1) languages with derivatives and zippers. I show a  formal proof of the algorithm's correctness and prove its  worst-case linear-time complexity. I show how the LL(1)  parsing with derivatives and zippers algorithm corresponds  to the traditional LL(1) parsing algorithm.
   344 I then present  SCALL1ON, a Scala parsing combinators library for LL(1)  languages that incorporates the LL(1) parsing with  derivatives and zippers algorithm. I present an expressive  and familiar combinator-based interface for describing  LL(1) languages. I present techniques that help precisely  locate LL(1) conflicts in user code. I discuss several  advantages of the parsing with derivatives approach within  the context of a parsing library. I also present SCALL1ON's  enumeration and pretty-printing features and discuss their  implementation. Through a series of benchmarks, I  demonstrate the good performance and practicality of the  approach. Finally, I present how to adapt the LL(1) parsing  with derivatives and zippers algorithm to support arbitrary  context-free languages. I show how the adapted algorithm  corresponds to general parsing algorithms, such as Earley's  parsing algorithm.},
   331 I then present  SCALL1ON, a Scala parsing combinators library for LL(1)  languages that incorporates the LL(1) parsing with  derivatives and zippers algorithm. I present an expressive  and familiar combinator-based interface for describing  LL(1) languages. I present techniques that help precisely  locate LL(1) conflicts in user code. I discuss several  advantages of the parsing with derivatives approach within  the context of a parsing library. I also present SCALL1ON's  enumeration and pretty-printing features and discuss their  implementation. Through a series of benchmarks, I  demonstrate the good performance and practicality of the  approach. Finally, I present how to adapt the LL(1) parsing  with derivatives and zippers algorithm to support arbitrary  context-free languages. I show how the adapted algorithm  corresponds to general parsing algorithms, such as Earley's  parsing algorithm.},
   345 	address = {Lausanne},
   332 	address = {Lausanne},
   346 	author = {Edelmann, Romain},
   333 	author = {R.~Edelmann},
   347 	doi = {10.5075/epfl-thesis-7357},
   334 	doi = {10.5075/epfl-thesis-7357},
   348 	institution = {IINFCOM},
   335 	institution = {IINFCOM},
   349 	pages = {246},
   336 	pages = {246},
   350 	publisher = {EPFL},
   337 	publisher = {EPFL},
   351 	title = {Efficient Parsing with Derivatives and Zippers},
   338 	title = {Efficient Parsing with Derivatives and Zippers},
   352 	url = {http://infoscience.epfl.ch/record/287059},
   339 	url = {http://infoscience.epfl.ch/record/287059},
   353 	year = {2021},
   340 	year = {2021},
   354 	bdsk-url-1 = {http://infoscience.epfl.ch/record/287059},
   341 	bdsk-url-1 = {http://infoscience.epfl.ch/record/287059},
   355 	bdsk-url-2 = {https://doi.org/10.5075/epfl-thesis-7357}}
   342 	bdsk-url-2 = {https://doi.org/10.5075/epfl-thesis-7357}}
   356 
   343 
       
   344 
       
   345 
       
   346 @inproceedings{Paulson2015,
       
   347 author = {L.~Paulson},
       
   348 year = {2015},
       
   349 month = {05},
       
   350 pages = {},
       
   351 title = {A Formalisation of Finite Automata Using Hereditarily Finite Sets},
       
   352 isbn = {978-3-319-21400-9},
       
   353 doi = {10.1007/978-3-319-21401-6_15}
       
   354 }
       
   355 
   357 @inproceedings{Zippy2020,
   356 @inproceedings{Zippy2020,
   358 	abstract = {In this paper, we present an efficient, functional, and formally verified parsing algorithm for LL(1) context-free expressions based on the concept of derivatives of formal languages. Parsing with derivatives is an elegant parsing technique, which, in the general case, suffers from cubic worst-case time complexity and slow performance in practice. We specialise the parsing with derivatives algorithm to LL(1) context-free expressions, where alternatives can be chosen given a single token of lookahead. We formalise the notion of LL(1) expressions and show how to efficiently check the LL(1) property. Next, we present a novel linear-time parsing with derivatives algorithm for LL(1) expressions operating on a zipper-inspired data structure. We prove the algorithm correct in Coq and present an implementation as a part of Scallion, a parser combinators framework in Scala with enumeration and pretty printing capabilities.},
   357 	abstract = {In this paper, we present an efficient, functional, and formally verified parsing algorithm for LL(1) context-free expressions based on the concept of derivatives of formal languages. Parsing with derivatives is an elegant parsing technique, which, in the general case, suffers from cubic worst-case time complexity and slow performance in practice. We specialise the parsing with derivatives algorithm to LL(1) context-free expressions, where alternatives can be chosen given a single token of lookahead. We formalise the notion of LL(1) expressions and show how to efficiently check the LL(1) property. Next, we present a novel linear-time parsing with derivatives algorithm for LL(1) expressions operating on a zipper-inspired data structure. We prove the algorithm correct in Coq and present an implementation as a part of Scallion, a parser combinators framework in Scala with enumeration and pretty printing capabilities.},
   359 	address = {New York, NY, USA},
   358 	address = {New York, NY, USA},
   360 	author = {Edelmann, Romain and Hamza, Jad and Kun\v{c}ak, Viktor},
   359 	author = {R.~Edelmann and J.~Hamza and V.~Kun\v{c}ak},
   361 	booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
   360 	booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
   362 	doi = {10.1145/3385412.3385992},
   361 	doi = {10.1145/3385412.3385992},
   363 	isbn = {9781450376136},
   362 	isbn = {9781450376136},
   364 	keywords = {Parsing, Zipper, Formal proof, LL(1), Derivatives},
   363 	keywords = {Parsing, Zipper, Formal proof, LL(1), Derivatives},
   365 	location = {London, UK},
   364 	location = {London, UK},
   371 	url = {https://doi.org/10.1145/3385412.3385992},
   370 	url = {https://doi.org/10.1145/3385412.3385992},
   372 	year = {2020},
   371 	year = {2020},
   373 	bdsk-url-1 = {https://doi.org/10.1145/3385412.3385992}}
   372 	bdsk-url-1 = {https://doi.org/10.1145/3385412.3385992}}
   374 
   373 
   375 @article{fowler2003,
   374 @article{fowler2003,
   376 	author = {Fowler, Glenn},
   375 	author = {G.~Fowler},
   377 	journal = {URL: https://web.archive.org/web/20050408073627/http://www.research.att.com/\~{}gsf/testregex/re-interpretation.html},
   376 	journal = {URL: https://web.archive.org/web/20050408073627/http://www.research.att.com/\~{}gsf/testregex/re-interpretation.html},
   378 	title = {An interpretation of the POSIX regex standard},
   377 	title = {An interpretation of the POSIX regex standard},
   379 	year = {2003}}
   378 	year = {2003}}
   380 
   379 
   381 @inproceedings{Snort1999,
   380 @inproceedings{Snort1999,
   390 	series = {LISA '99},
   389 	series = {LISA '99},
   391 	title = {Snort - Lightweight Intrusion Detection for Networks},
   390 	title = {Snort - Lightweight Intrusion Detection for Networks},
   392 	year = {1999}}
   391 	year = {1999}}
   393 
   392 
   394 
   393 
       
   394 @article{Reps1998,
       
   395 author = {T.~Reps},
       
   396 title = {“Maximal-Munch” Tokenization in Linear Time},
       
   397 year = {1998},
       
   398 issue_date = {March 1998},
       
   399 publisher = {Association for Computing Machinery},
       
   400 address = {New York, NY, USA},
       
   401 volume = {20},
       
   402 number = {2},
       
   403 issn = {0164-0925},
       
   404 url = {https://doi.org/10.1145/276393.276394},
       
   405 doi = {10.1145/276393.276394},
       
   406 abstract = {The lexical-analysis (or scanning) phase of a compiler attempts to partition an input string into a sequence of tokens. The convention in most languages is that the input is scanned left to right, and each token identified is a “maximal munch” of the remaining input—the longest prefix of the remaining input that is a token of the language. Although most of the standard compiler textbooks present a way to perform maximal-munch tokenization, the algorithm they describe is one that, for certain sets of token definitions, can cause the scanner to exhibit quadratic behavior in the worst case. In the article, we show that maximal-munch tokenization can always be performed in time linear in the size of the input.},
       
   407 journal = {ACM Trans. Program. Lang. Syst.},
       
   408 month = {mar},
       
   409 pages = {259–273},
       
   410 numpages = {15},
       
   411 keywords = {backtracking, tokenization, tabulation, dynamic programming, memoization}
       
   412 }
   395 
   413 
   396 @inproceedings{Bro,
   414 @inproceedings{Bro,
   397 author = {V.~Paxson},
   415 author = {V.~Paxson},
   398 title = {Bro: A System for Detecting Network Intruders in Real-Time},
   416 title = {Bro: A System for Detecting Network Intruders in Real-Time},
   399 year = {1998},
   417 year = {1998},
   405 numpages = {1},
   423 numpages = {1},
   406 location = {San Antonio, Texas},
   424 location = {San Antonio, Texas},
   407 series = {SSYM'98}
   425 series = {SSYM'98}
   408 }
   426 }
   409 
   427 
   410 
   428 @INPROCEEDINGS{Sidhu2001,
       
   429   author={R.~Sidhu and V.K.~Prasanna},
       
   430   booktitle={The 9th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM'01)}, 
       
   431   title={Fast Regular Expression Matching Using FPGAs}, 
       
   432   year={2001},
       
   433   volume={},
       
   434   number={},
       
   435   pages={227-238},
       
   436   doi={}
       
   437 }
       
   438 @article{Turonova2020,
       
   439 author = {L.~Turo\v{n}ov\'{a} and L.~Hol\'{\i}k and O.~Leng\'{a}l and O.~Saarikivi and M.~Veanes and T.~Vojnar},
       
   440 title = {Regex Matching with Counting-Set Automata},
       
   441 year = {2020},
       
   442 issue_date = {November 2020},
       
   443 publisher = {Association for Computing Machinery},
       
   444 address = {New York, NY, USA},
       
   445 volume = {4},
       
   446 number = {OOPSLA},
       
   447 url = {https://doi.org/10.1145/3428286},
       
   448 doi = {10.1145/3428286},
       
   449 abstract = {We propose a solution to the problem of efficient matching of regular expressions (regexes) with bounded repetition, such as (ab){1,100}, using deterministic automata. For this, we introduce novel counting-set automata (CsAs), automata with registers that can hold sets of bounded integers and can be manipulated by a limited portfolio of constant-time operations. We present an algorithm that compiles a large sub-class of regexes to deterministic CsAs. This includes (1) a novel Antimirov-style translation of regexes with counting to counting automata (CAs), nondeterministic automata with bounded counters, and (2) our main technical contribution, a determinization of CAs that outputs CsAs. The main advantage of this workflow is that the size of the produced CsAs does not depend on the repetition bounds used in the regex (while the size of the DFA is exponential to them). Our experimental results confirm that deterministic CsAs produced from practical regexes with repetition are indeed vastly smaller than the corresponding DFAs. More importantly, our prototype matcher based on CsA simulation handles practical regexes with repetition regardless of sizes of counter bounds. It easily copes with regexes with repetition where state-of-the-art matchers struggle.},
       
   450 journal = {Proc. ACM Program. Lang.},
       
   451 month = {nov},
       
   452 articleno = {218},
       
   453 numpages = {30},
       
   454 keywords = {counting-set automata, counting automata, bounded repetition, ReDos, Antimirov's derivatives, regular expression matching, determinization}
       
   455 }
       
   456 
       
   457 @inproceedings{Kumar2006,
       
   458 author = {S.~Kumar and S.~Dharmapurikar and F.~Yu and P.~Crowley and J.~Turner},
       
   459 title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
       
   460 year = {2006},
       
   461 isbn = {1595933085},
       
   462 publisher = {Association for Computing Machinery},
       
   463 address = {New York, NY, USA},
       
   464 url = {https://doi.org/10.1145/1159913.1159952},
       
   465 doi = {10.1145/1159913.1159952},
       
   466 abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. However, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application. In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space requirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95%. Given the substantially reduced space requirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide cost-effective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
       
   467 booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
       
   468 pages = {339–350},
       
   469 numpages = {12},
       
   470 keywords = {regular expressions, DFA, deep packet inspection},
       
   471 location = {Pisa, Italy},
       
   472 series = {SIGCOMM '06}
       
   473 }
       
   474 
       
   475 
       
   476 
       
   477 
       
   478 
       
   479 @inproceedings{Becchi2007,
       
   480 author = {M.~Becchi and P.~Crowley},
       
   481 title = {An Improved Algorithm to Accelerate Regular Expression Evaluation},
       
   482 year = {2007},
       
   483 isbn = {9781595939456},
       
   484 publisher = {Association for Computing Machinery},
       
   485 address = {New York, NY, USA},
       
   486 url = {https://doi.org/10.1145/1323548.1323573},
       
   487 doi = {10.1145/1323548.1323573},
       
   488 abstract = {Modern network intrusion detection systems need to perform regular expression matching at line rate in order to detect the occurrence of critical patterns in packet payloads. While deterministic finite automata (DFAs) allow this operation to be performed in linear time, they may exhibit prohibitive memory requirements. In [9], Kumar et al. propose Delayed Input DFAs (D2FAs), which provide a trade-off between the memory requirements of the compressed DFA and the number of states visited for each character processed, which corresponds directly to the memory bandwidth required to evaluate regular expressions.In this paper we introduce a general compression technique that results in at most 2N state traversals when processing a string of length N. In comparison to the D2FA approach, our technique achieves comparable levels of compression, with lower provable bounds on memory bandwidth (or greater compression for a given bandwidth bound). Moreover, our proposed algorithm has lower complexity, is suitable for scenarios where a compressed DFA needs to be dynamically built or updated, and fosters locality in the traversal process. Finally, we also describe a novel alphabet reduction scheme for DFA-based structures that can yield further dramatic reductions in data structure size.},
       
   489 booktitle = {Proceedings of the 3rd ACM/IEEE Symposium on Architecture for Networking and Communications Systems},
       
   490 pages = {145–154},
       
   491 numpages = {10},
       
   492 keywords = {regular expressions, DFA, deep packet inspection},
       
   493 location = {Orlando, Florida, USA},
       
   494 series = {ANCS '07}
       
   495 }
   411 
   496 
   412 @inproceedings{Becchi08,
   497 @inproceedings{Becchi08,
   413 	author = {Becchi, Michela and Crowley, Patrick},
   498 	author = {M.~Becchi and P.~Crowley},
   414 	doi = {10.1145/1544012.1544037},
   499 	doi = {10.1145/1544012.1544037},
   415 	month = {01},
   500 	month = {01},
   416 	pages = {25},
   501 	pages = {25},
   417 	title = {Extending finite automata to efficiently match Perl-compatible regular expressions},
   502 	title = {Extending finite automata to efficiently match Perl-compatible regular expressions},
   418 	year = {2008},
   503 	year = {2008},
   442 	year = {2021},
   527 	year = {2021},
   443 	bdsk-url-1 = {https://doi.org/10.1109/SPW53761.2021.00022}}
   528 	bdsk-url-1 = {https://doi.org/10.1109/SPW53761.2021.00022}}
   444 
   529 
   445 
   530 
   446 @inproceedings{Nipkow1998,
   531 @inproceedings{Nipkow1998,
   447   author    = {Tobias Nipkow},
   532   author    = {T.~Nipkow},
   448   editor    = {Jim Grundy and
   533   editor    = {Jim Grundy and
   449                Malcolm C. Newey},
   534                Malcolm C. Newey},
   450   title     = {Verified Lexical Analysis},
   535   title     = {Verified Lexical Analysis},
   451   booktitle = {Theorem Proving in Higher Order Logics, 11th International Conference,
   536   booktitle = {Theorem Proving in Higher Order Logics, 11th International Conference,
   452                TPHOLs'98, Canberra, Australia, September 27 - October 1, 1998, Proceedings},
   537                TPHOLs'98, Canberra, Australia, September 27 - October 1, 1998, Proceedings},