--- a/ChengsongTanPhdThesis/example.bib Mon Nov 21 23:56:15 2022 +0000
+++ b/ChengsongTanPhdThesis/example.bib Tue Nov 22 12:50:04 2022 +0000
@@ -6,7 +6,7 @@
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{Doczkal2013,
-author = {Doczkal, Christian and Kaiser, Jan-Oliver and Smolka, Gert},
+author = {C.~Doczkal and J.O.~Kaiser and G.~Smolka},
title = {A Constructive Theory of Regular Languages in Coq},
year = {2013},
isbn = {9783319035444},
@@ -21,10 +21,19 @@
keywords = {finite automata, regular expressions, Myhill-Nerode, Ssreflect, Coq, regular languages}
}
-
+@inproceedings{Lammich2012,
+author = {P.~Lammich and T.~Tuerk},
+booktitle = {Interactive Theorem Proving (ITP 2012)},
+year = {2012},
+month = {08},
+pages = {166--182},
+title = {Applying Data Refinement for Monadic Programs to Hopcroft's Algorithm},
+isbn = {978-3-642-32346-1},
+doi = {10.1007/978-3-642-32347-8_12}
+}
@article{Krauss2012,
- author={Alexander Krauss and Tobias Nipkow},
+ author={A.~Krauss and T.~Nipkow},
title={Proof Pearl: Regular Expression Equivalence and Relation Algebra},
journal={J. Automated Reasoning},volume=49,pages={95--106},year=2012,note={published online March 2011}}
@@ -40,24 +48,6 @@
publisher={Princeton, NJ}
}
-@inproceedings{Sailesh2006,
-author = {K.~Sailesh and D.~Sarang and Y.~Fang and C.~Patrick and T.~Jonathan},
-title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
-year = {2006},
-isbn = {1595933085},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-url = {https://doi.org/10.1145/1159913.1159952},
-doi = {10.1145/1159913.1159952},
-abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. owever, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application.In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space equirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95%. Given the substantially reduced space equirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide ostffective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
-booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
-pages = {339–350},
-numpages = {12},
-keywords = {deep packet inspection, regular expressions, DFA},
-location = {Pisa, Italy},
-series = {SIGCOMM '06}
-}
-
@article{Murugesan2014,
@@ -102,7 +92,7 @@
@inproceedings{Berglund18,
abstract = {Whereas Perl-compatible regular expression matchers typically exhibit some variation of leftmost-greedy semantics, those conforming to the posix standard are prescribed leftmost-longest semantics. However, the posix standard leaves some room for interpretation, and Fowler and Kuklewicz have done experimental work to confirm differences between various posix matchers. The Boost library has an interesting take on the posix standard, where it maximises the leftmost match not with respect to subexpressions of the regular expression pattern, but rather, with respect to capturing groups. In our work, we provide the first formalisation of Boost semantics, and we analyse the complexity of regular expression matching when using Boost semantics.},
address = {Cham},
- author = {M.~Berglund and Bester, Willem and van der Merwe, Brink},
+ author = {M.~Berglund and W.~Bester and B.~van der Merwe},
booktitle = {Theoretical Aspects of Computing -- ICTAC 2018},
editor = {Fischer, Bernd and Uustalu, Tarmo},
isbn = {978-3-030-02508-3},
@@ -112,7 +102,7 @@
year = {2018}}
@inproceedings{Chen12,
- author = {Chen, Haiming and Yu, Sheng},
+ author = {H.~Chen and S.~Yu},
doi = {10.1007/978-3-642-27654-5_27},
month = {01},
pages = {343-356},
@@ -122,7 +112,7 @@
bdsk-url-1 = {https://doi.org/10.1007/978-3-642-27654-5_27}}
@article{Takayuki2019,
- author = {Takayuki Miyazaki and Yasuhiko Minamide},
+ author = {T.~Miyazaki and Y.~Minamide},
doi = {10.2197/ipsjjip.27.422},
journal = {Journal of Information Processing},
pages = {422-430},
@@ -166,7 +156,7 @@
@article{BERGLUND2022,
abstract = {Most modern regular expression matching libraries (one of the rare exceptions being Google's RE2) allow backreferences, operations which bind a substring to a variable, allowing it to be matched again verbatim. However, both real-world implementations and definitions in the literature use different syntactic restrictions and have differences in the semantics of the matching of backreferences. Our aim is to compare these various flavors by considering the classes of formal languages that each can describe, establishing, as a result, a hierarchy of language classes. Beyond the hierarchy itself, some complexity results are given, and as part of the effort on comparing language classes new pumping lemmas are established, old classes are extended to new ones, and several incidental results on the nature of these language classes are given.},
- author = {Martin Berglund and Brink {van der Merwe}},
+ author = {M.~Berglund and B.~{van der Merwe}},
doi = {https://doi.org/10.1016/j.tcs.2022.10.041},
issn = {0304-3975},
journal = {Theoretical Computer Science},
@@ -179,7 +169,7 @@
@article{FREYDENBERGER20191,
abstract = {Most modern libraries for regular expression matching allow back-references (i.e., repetition operators) that substantially increase expressive power, but also lead to intractability. In order to find a better balance between expressiveness and tractability, we combine these with the notion of determinism for regular expressions used in XML DTDs and XML Schema. This includes the definition of a suitable automaton model, and a generalization of the Glushkov construction. We demonstrate that, compared to their non-deterministic superclass, these deterministic regular expressions with back-references have desirable algorithmic properties (i.e., efficiently solvable membership problem and some decidable problems in static analysis), while, at the same time, their expressive power exceeds that of deterministic regular expressions without back-references.},
- author = {Dominik D. Freydenberger and Markus L. Schmid},
+ author = {D.D.~Freydenberger and M.L.~Schmid},
doi = {https://doi.org/10.1016/j.jcss.2019.04.001},
issn = {0022-0000},
journal = {Journal of Computer and System Sciences},
@@ -195,7 +185,7 @@
@inproceedings{Frey2013,
address = {Dagstuhl, Germany},
annote = {Keywords: extended regular expressions, regex, decidability, non-recursive tradeoffs},
- author = {Dominik D. Freydenberger},
+ author = {D.D.~Freydenberger},
booktitle = {28th International Symposium on Theoretical Aspects of Computer Science (STACS 2011)},
doi = {10.4230/LIPIcs.STACS.2011.507},
editor = {Thomas Schwentick and Christoph D{\"u}rr},
@@ -235,7 +225,7 @@
@article{CAMPEANU2009Intersect,
abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255--300] and rigorously in 2003 by C{\^a}mpeanu, Salomaa and Yu [C. C{\^a}mpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007--1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines --- regex automata systems (RAS) --- which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.},
- author = {Cezar C{\^a}mpeanu and Nicolae Santean},
+ author = {C.~C{\^a}mpeanu and N.~Santean},
doi = {https://doi.org/10.1016/j.tcs.2009.02.022},
issn = {0304-3975},
journal = {Theoretical Computer Science},
@@ -286,42 +276,39 @@
year = {2011},
bdsk-url-1 = {https://doi.org/10.1145/2034773.2034801}}
-@article{10.1145/2034574.2034801,
- abstract = {We present a functional approach to parsing unrestricted context-free grammars based on Brzozowski's derivative of regular expressions. If we consider context-free grammars as recursive regular expressions, Brzozowski's equational theory extends without modification to context-free grammars (and it generalizes to parser combinators). The supporting actors in this story are three concepts familiar to functional programmers - laziness, memoization and fixed points; these allow Brzozowski's original equations to be transliterated into purely functional code in about 30 lines spread over three functions.Yet, this almost impossibly brief implementation has a drawback: its performance is sour - in both theory and practice. The culprit? Each derivative can double the size of a grammar, and with it, the cost of the next derivative.Fortunately, much of the new structure inflicted by the derivative is either dead on arrival, or it dies after the very next derivative. To eliminate it, we once again exploit laziness and memoization to transliterate an equational theory that prunes such debris into working code. Thanks to this compaction, parsing times become reasonable in practice.We equip the functional programmer with two equational theories that, when combined, make for an abbreviated understanding and implementation of a system for parsing context-free languages.},
- address = {New York, NY, USA},
- author = {Might, Matthew and Darais, David and Spiewak, Daniel},
- doi = {10.1145/2034574.2034801},
- issn = {0362-1340},
- issue_date = {September 2011},
- journal = {SIGPLAN Not.},
- keywords = {formal languages, derivative, context-free grammar, regular expressions, parsing, parser combinator},
- month = {sep},
- number = {9},
- numpages = {7},
- pages = {189--195},
- publisher = {Association for Computing Machinery},
- title = {Parsing with Derivatives: A Functional Pearl},
- url = {https://doi.org/10.1145/2034574.2034801},
- volume = {46},
- year = {2011},
- bdsk-url-1 = {https://doi.org/10.1145/2034574.2034801}}
-
@inproceedings{Adams2016,
title={On the complexity and performance of parsing with derivatives},
- author={Adams, Michael D and Hollenbeck, Celeste and Might, Matthew},
+ author={M.D.~Adams and C.~Hollenbeck and M.~Might},
booktitle={Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages={224--236},
year={2016}
}
-
+@article{Huet1997,
+author = {G.~Huet},
+title = {The Zipper},
+year = {1997},
+issue_date = {September 1997},
+publisher = {Cambridge University Press},
+address = {USA},
+volume = {7},
+number = {5},
+issn = {0956-7968},
+url = {https://doi.org/10.1017/S0956796897002864},
+doi = {10.1017/S0956796897002864},
+abstract = {Almost every programmer has faced the problem of representing a tree together with a subtree that is the focus of attention, where that focus may move left, right, up or down the tree. The Zipper is Huet's nifty name for a nifty data structure which fulfills this need. I wish I had known of it when I faced this task, because the solution I came up with was not quite so efficient or elegant as the Zipper.},
+journal = {J. Funct. Program.},
+month = {sep},
+pages = {549--554},
+numpages = {6}
+}
@article{Darragh2020,
abstract = {Parsing with Derivatives (PwD) is an elegant approach to parsing context-free grammars (CFGs). It takes the equational theory behind Brzozowski's derivative for regular expressions and augments that theory with laziness, memoization, and fixed points. The result is a simple parser for arbitrary CFGs. Although recent work improved the performance of PwD, it remains inefficient due to the algorithm repeatedly traversing some parts of the grammar. In this functional pearl, we show how to avoid this inefficiency by suspending the state of the traversal in a zipper. When subsequent derivatives are taken, we can resume the traversal from where we left off without retraversing already traversed parts of the grammar. However, the original zipper is designed for use with trees, and we want to parse CFGs. CFGs can include shared regions, cycles, and choices between alternates, which makes them incompatible with the traditional tree model for zippers. This paper develops a generalization of zippers to properly handle these additional features. Just as PwD generalized Brzozowski's derivatives from regular expressions to CFGs, we generalize Huet's zippers from trees to CFGs. Abstract The resulting parsing algorithm is concise and efficient: it takes only 31 lines of OCaml code to implement the derivative function but performs 6,500 times faster than the original PwD and 3.24 times faster than the optimized implementation of PwD.},
address = {New York, NY, USA},
articleno = {108},
- author = {Darragh, Pierce and Adams, Michael D.},
+ author = {P.~Darragh and M.D.~Adams},
doi = {10.1145/3408990},
issue_date = {August 2020},
journal = {Proc. ACM Program. Lang.},
@@ -343,7 +330,7 @@
Then, I present a characterisation of LL(1) languages based on the concept of should-not-follow sets. I present an algorithm for parsing LL(1) languages with derivatives and zippers. I show a formal proof of the algorithm's correctness and prove its worst-case linear-time complexity. I show how the LL(1) parsing with derivatives and zippers algorithm corresponds to the traditional LL(1) parsing algorithm.
I then present SCALL1ON, a Scala parsing combinators library for LL(1) languages that incorporates the LL(1) parsing with derivatives and zippers algorithm. I present an expressive and familiar combinator-based interface for describing LL(1) languages. I present techniques that help precisely locate LL(1) conflicts in user code. I discuss several advantages of the parsing with derivatives approach within the context of a parsing library. I also present SCALL1ON's enumeration and pretty-printing features and discuss their implementation. Through a series of benchmarks, I demonstrate the good performance and practicality of the approach. Finally, I present how to adapt the LL(1) parsing with derivatives and zippers algorithm to support arbitrary context-free languages. I show how the adapted algorithm corresponds to general parsing algorithms, such as Earley's parsing algorithm.},
address = {Lausanne},
- author = {Edelmann, Romain},
+ author = {R.~Edelmann},
doi = {10.5075/epfl-thesis-7357},
institution = {IINFCOM},
pages = {246},
@@ -354,10 +341,22 @@
bdsk-url-1 = {http://infoscience.epfl.ch/record/287059},
bdsk-url-2 = {https://doi.org/10.5075/epfl-thesis-7357}}
+
+
+@inproceedings{Paulson2015,
+author = {L.~Paulson},
+year = {2015},
+month = {05},
+booktitle = {Automated Deduction -- CADE-25},
+title = {A Formalisation of Finite Automata Using Hereditarily Finite Sets},
+isbn = {978-3-319-21400-9},
+doi = {10.1007/978-3-319-21401-6_15}
+}
+
@inproceedings{Zippy2020,
abstract = {In this paper, we present an efficient, functional, and formally verified parsing algorithm for LL(1) context-free expressions based on the concept of derivatives of formal languages. Parsing with derivatives is an elegant parsing technique, which, in the general case, suffers from cubic worst-case time complexity and slow performance in practice. We specialise the parsing with derivatives algorithm to LL(1) context-free expressions, where alternatives can be chosen given a single token of lookahead. We formalise the notion of LL(1) expressions and show how to efficiently check the LL(1) property. Next, we present a novel linear-time parsing with derivatives algorithm for LL(1) expressions operating on a zipper-inspired data structure. We prove the algorithm correct in Coq and present an implementation as a part of Scallion, a parser combinators framework in Scala with enumeration and pretty printing capabilities.},
address = {New York, NY, USA},
- author = {Edelmann, Romain and Hamza, Jad and Kun\v{c}ak, Viktor},
+ author = {R.~Edelmann and J.~Hamza and V.~Kun\v{c}ak},
booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
doi = {10.1145/3385412.3385992},
isbn = {9781450376136},
@@ -373,7 +372,7 @@
bdsk-url-1 = {https://doi.org/10.1145/3385412.3385992}}
@article{fowler2003,
- author = {Fowler, Glenn},
+ author = {G.~Fowler},
journal = {URL: https://web. archive. org/web/20050408073627/http://www. research. att. com/\~{} gsf/testregex/re-interpretation. html},
title = {An interpretation of the POSIX regex standard},
year = {2003}}
@@ -392,6 +391,25 @@
year = {1999}}
+@article{Reps1998,
+author = {T.~Reps},
+title = {``Maximal-Munch'' Tokenization in Linear Time},
+year = {1998},
+issue_date = {March 1998},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {20},
+number = {2},
+issn = {0164-0925},
+url = {https://doi.org/10.1145/276393.276394},
+doi = {10.1145/276393.276394},
+abstract = {The lexical-analysis (or scanning) phase of a compiler attempts to partition an input string into a sequence of tokens. The convention in most languages is that the input is scanned left to right, and each token identified is a “maximal munch” of the remaining input—the longest prefix of the remaining input that is a token of the language. Although most of the standard compiler textbooks present a way to perform maximal-munch tokenization, the algorithm they describe is one that, for certain sets of token definitions, can cause the scanner to exhibit quadratic behavior in the worst case. In the article, we show that maximal-munch tokenization can always be performed in time linear in the size of the input.},
+journal = {ACM Trans. Program. Lang. Syst.},
+month = {mar},
+pages = {259--273},
+numpages = {15},
+keywords = {backtracking, tokenization, tabulation, dynamic programming, memoization}
+}
@inproceedings{Bro,
author = {V.~Paxson},
@@ -407,10 +425,77 @@
series = {SSYM'98}
}
+@inproceedings{Sidhu2001,
+ author={R.~Sidhu and V.K.~Prasanna},
+ booktitle={The 9th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM'01)},
+ title={Fast Regular Expression Matching Using FPGAs},
+ year={2001},
+ volume={},
+ number={},
+ pages={227--238},
+ doi={}
+}
+@article{Turonova2020,
+author = {L.~Turo\v{n}ov\'{a} and L.~Hol\'{\i}k and O.~Leng\'{a}l and O.~Saarikivi and M.~Veanes and T.~Vojnar},
+title = {Regex Matching with Counting-Set Automata},
+year = {2020},
+issue_date = {November 2020},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {4},
+number = {OOPSLA},
+url = {https://doi.org/10.1145/3428286},
+doi = {10.1145/3428286},
+abstract = {We propose a solution to the problem of efficient matching regular expressions (regexes) with bounded repetition, such as (ab){1,100}, using deterministic automata. For this, we introduce novel counting-set automata (CsAs), automata with registers that can hold sets of bounded integers and can be manipulated by a limited portfolio of constant-time operations. We present an algorithm that compiles a large sub-class of regexes to deterministic CsAs. This includes (1) a novel Antimirov-style translation of regexes with counting to counting automata (CAs), nondeterministic automata with bounded counters, and (2) our main technical contribution, a determinization of CAs that outputs CsAs. The main advantage of this workflow is that the size of the produced CsAs does not depend on the repetition bounds used in the regex (while the size of the DFA is exponential to them). Our experimental results confirm that deterministic CsAs produced from practical regexes with repetition are indeed vastly smaller than the corresponding DFAs. More importantly, our prototype matcher based on CsA simulation handles practical regexes with repetition regardless of sizes of counter bounds. It easily copes with regexes with repetition where state-of-the-art matchers struggle.},
+journal = {Proc. ACM Program. Lang.},
+month = {nov},
+articleno = {218},
+numpages = {30},
+keywords = {counting-set automata, counting automata, bounded repetition, ReDos, Antimirov's derivatives, regular expression matching, determinization}
+}
+
+@inproceedings{Kumar2006,
+author = {S.~Kumar and S.~Dharmapurikar and F.~Yu and P.~Crowley and J.~Turner},
+title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
+year = {2006},
+isbn = {1595933085},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/1159913.1159952},
+doi = {10.1145/1159913.1159952},
+abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. However, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application. In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space requirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95%. Given the substantially reduced space requirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide cost-effective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
+booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
+pages = {339--350},
+numpages = {12},
+keywords = {regular expressions, DFA, deep packet inspection},
+location = {Pisa, Italy},
+series = {SIGCOMM '06}
+}
+
+
+
+@inproceedings{Becchi2007,
+author = {M.~Becchi and P.~Crowley},
+title = {An Improved Algorithm to Accelerate Regular Expression Evaluation},
+year = {2007},
+isbn = {9781595939456},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/1323548.1323573},
+doi = {10.1145/1323548.1323573},
+abstract = {Modern network intrusion detection systems need to perform regular expression matching at line rate in order to detect the occurrence of critical patterns in packet payloads. While deterministic finite automata (DFAs) allow this operation to be performed in linear time, they may exhibit prohibitive memory requirements. In [9], Kumar et al. propose Delayed Input DFAs (D2FAs), which provide a trade-off between the memory requirements of the compressed DFA and the number of states visited for each character processed, which corresponds directly to the memory bandwidth required to evaluate regular expressions.In this paper we introduce a general compression technique that results in at most 2N state traversals when processing a string of length N. In comparison to the D2FA approach, our technique achieves comparable levels of compression, with lower provable bounds on memory bandwidth (or greater compression for a given bandwidth bound). Moreover, our proposed algorithm has lower complexity, is suitable for scenarios where a compressed DFA needs to be dynamically built or updated, and fosters locality in the traversal process. Finally, we also describe a novel alphabet reduction scheme for DFA-based structures that can yield further dramatic reductions in data structure size.},
+booktitle = {Proceedings of the 3rd ACM/IEEE Symposium on Architecture for Networking and Communications Systems},
+pages = {145--154},
+numpages = {10},
+keywords = {regular expressions, DFA, deep packet inspection},
+location = {Orlando, Florida, USA},
+series = {ANCS '07}
+}
+
@inproceedings{Becchi08,
- author = {Becchi, Michela and Crowley, Patrick},
+ author = {M.~Becchi and P.~Crowley},
doi = {10.1145/1544012.1544037},
month = {01},
pages = {25},
@@ -444,7 +529,7 @@
@inproceedings{Nipkow1998,
- author = {Tobias Nipkow},
+ author = {T.~Nipkow},
editor = {Jim Grundy and
Malcolm C. Newey},
title = {Verified Lexical Analysis},