--- a/ChengsongTanPhdThesis/example.bib Thu Nov 17 23:13:57 2022 +0000
+++ b/ChengsongTanPhdThesis/example.bib Mon Nov 21 23:56:15 2022 +0000
@@ -1,200 +1,217 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
-%% Created for CS TAN at 2022-05-23 18:43:50 +0100
+%% Created for CS TAN at 2022-11-20 17:26:32 +0000
%% Saved with string encoding Unicode (UTF-8)
-
-
-@article{Murugesan2014,
- author = {N.~Murugesan and O.~V.~Shanmuga Sundaram},
- title = {{S}ome {P}roperties of {B}rzozowski {D}erivatives of {R}egular {E}xpressions},
- journal = {International Journal of Computer Trends and Technology},
- volume = {13},
- number = {1},
- year = {2014},
- url = {http://arxiv.org/abs/1407.5902},
- pages = {29--33}
-}
-
-@PhdThesis{Ausaf,
- author = {F.~Ausaf},
- title = {{V}erified {L}exing and {P}arsing},
- school = {King's College London},
- year = {2018}
-}
-
-%% POSIX specification------------------------
-@InProceedings{Okui10,
-author=" S.~Okui
-and T.~Suzuki",
-editor="Domaratzki, Michael
-and Salomaa, Kai",
-title="Disambiguation in Regular Expression Matching via Position Automata with Augmented Transitions",
-booktitle="Implementation and Application of Automata",
-year="2011",
-publisher="Springer Berlin Heidelberg",
-address="Berlin, Heidelberg",
-pages="231--240",
-abstract="This paper offers a new efficient regular expression matching algorithm which follows the POSIX-type leftmost-longest rule. The algorithm basically emulates the subset construction without backtracking, so that its computational cost even in the worst case does not explode exponentially; the time complexity of the algorithm is O(mn(n{\thinspace}+{\thinspace}c)), where m is the length of a given input string, n the number of occurrences of the most frequently used letter in a given regular expression and c the number of subexpressions to be used for capturing substrings. A formalization of the leftmost-longest semantics by using parse trees is also discussed.",
-isbn="978-3-642-18098-9"
-}
-
-%% POSIX specification------------------------
-
-%% Brzozowski ders------------------------
-@article{Berglund14,
-author = {M.~Berglund, F.~Drewes and B.~Van Der Merwe},
-year = {2014},
-month = {05},
-pages = {},
-title = {Analyzing Catastrophic Backtracking Behavior in Practical Regular Expression Matching},
-volume = {151},
-journal = {Electronic Proceedings in Theoretical Computer Science},
-doi = {10.4204/EPTCS.151.7}
-}
-
-@InProceedings{Berglund18,
-author="M.~Berglund
-and Bester, Willem
-and van der Merwe, Brink",
-editor="Fischer, Bernd
-and Uustalu, Tarmo",
-title="Formalising Boost POSIX Regular Expression Matching",
-booktitle="Theoretical Aspects of Computing -- ICTAC 2018",
-year="2018",
-publisher="Springer International Publishing",
-address="Cham",
-pages="99--115",
-abstract="Whereas Perl-compatible regular expression matchers typically exhibit some variation of leftmost-greedy semantics, those conforming to the posix standard are prescribed leftmost-longest semantics. However, the posix standard leaves some room for interpretation, and Fowler and Kuklewicz have done experimental work to confirm differences between various posix matchers. The Boost library has an interesting take on the posix standard, where it maximises the leftmost match not with respect to subexpressions of the regular expression pattern, but rather, with respect to capturing groups. In our work, we provide the first formalisation of Boost semantics, and we analyse the complexity of regular expression matching when using Boost semantics.",
-isbn="978-3-030-02508-3"
+@inproceedings{Doczkal2013,
+author = {Doczkal, Christian and Kaiser, Jan-Oliver and Smolka, Gert},
+title = {A Constructive Theory of Regular Languages in Coq},
+year = {2013},
+isbn = {9783319035444},
+publisher = {Springer-Verlag},
+address = {Berlin, Heidelberg},
+url = {https://doi.org/10.1007/978-3-319-03545-1_6},
+doi = {10.1007/978-3-319-03545-1_6},
+abstract = {We present a formal constructive theory of regular languages consisting of about 1400 lines of Coq/Ssreflect. As representations we consider regular expressions, deterministic and nondeterministic automata, and Myhill and Nerode partitions. We construct computable functions translating between these representations and show that equivalence of representations is decidable. We also establish the usual closure properties, give a minimization algorithm for DFAs, and prove that minimal DFAs are unique up to state renaming. Our development profits much from Ssreflect's support for finite types and graphs.},
+booktitle = {Proceedings of the Third International Conference on Certified Programs and Proofs - Volume 8307},
+pages = {82--97},
+numpages = {16},
+keywords = {finite automata, regular expressions, Myhill-Nerode, Ssreflect, Coq, regular languages}
}
-@inproceedings{Chen12,
-author = {Chen, Haiming and Yu, Sheng},
-year = {2012},
-month = {01},
-pages = {343-356},
-title = {Derivatives of Regular Expressions and an Application},
-volume = {7160},
-doi = {10.1007/978-3-642-27654-5_27}
+
+@article{Krauss2012,
+	author = {Alexander Krauss and Tobias Nipkow},
+	journal = {J. Automated Reasoning},
+	note = {published online March 2011},
+	pages = {95--106},
+	title = {Proof Pearl: Regular Expression Equivalence and Relation Algebra},
+	volume = {49},
+	year = {2012}}
+
+
+
+@article{kleene1956,
+ title={Representation of events in nerve nets and finite automata},
+  author={S.C.~Kleene},
+  journal={Automata Studies},
+  volume={34},
+  pages={3--41},
+  year={1956},
+  publisher={Princeton University Press},
+  address={Princeton, NJ}
+}
+
+@inproceedings{Sailesh2006,
+author = {S.~Kumar and S.~Dharmapurikar and F.~Yu and P.~Crowley and J.~Turner},
+title = {Algorithms to Accelerate Multiple Regular Expressions Matching for Deep Packet Inspection},
+year = {2006},
+isbn = {1595933085},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/1159913.1159952},
+doi = {10.1145/1159913.1159952},
+abstract = {There is a growing demand for network devices capable of examining the content of data packets in order to improve network security and provide application-specific services. Most high performance systems that perform deep packet inspection implement simple string matching algorithms to match packets against a large, but finite set of strings. However, there is growing interest in the use of regular expression-based pattern matching, since regular expressions offer superior expressive power and flexibility. Deterministic finite automata (DFA) representations are typically used to implement regular expressions. However, DFA representations of regular expression sets arising in network applications require large amounts of memory, limiting their practical application. In this paper, we introduce a new representation for regular expressions, called the Delayed Input DFA (D2FA), which substantially reduces space requirements as compared to a DFA. A D2FA is constructed by transforming a DFA via incrementally replacing several transitions of the automaton with a single default transition. Our approach dramatically reduces the number of distinct transitions between states. For a collection of regular expressions drawn from current commercial and academic systems, a D2FA representation reduces transitions by more than 95\%. Given the substantially reduced space requirements, we describe an efficient architecture that can perform deep packet inspection at multi-gigabit rates. Our architecture uses multiple on-chip memories in such a way that each remains uniformly occupied and accessed over a short duration, thus effectively distributing the load and enabling high throughput. Our architecture can provide cost-effective packet content scanning at OC-192 rates with memory requirements that are consistent with current ASIC technology.},
+booktitle = {Proceedings of the 2006 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
+pages = {339--350},
+numpages = {12},
+keywords = {deep packet inspection, regular expressions, DFA},
+location = {Pisa, Italy},
+series = {SIGCOMM '06}
}
-%% Brzozowski ders------------------------
-%@article{Murugesan2014,
-% author = {N.~Murugesan and O.~V.~Shanmuga Sundaram},
-% title = {{S}ome {P}roperties of {B}rzozowski {D}erivatives of {R}egular {E}xpressions},
-% journal = {International Journal of Computer Trends and Technology},
-% volume = {13},
-% number = {1},
-% year = {2014},
-% url = {http://arxiv.org/abs/1407.5902},
-% pages = {29--33}
-%}
+@article{Murugesan2014,
+ author = {N.~Murugesan and O.~V.~Shanmuga Sundaram},
+ journal = {International Journal of Computer Trends and Technology},
+ number = {1},
+ pages = {29--33},
+ title = {{S}ome {P}roperties of {B}rzozowski {D}erivatives of {R}egular {E}xpressions},
+ url = {http://arxiv.org/abs/1407.5902},
+ volume = {13},
+ year = {2014},
+ bdsk-url-1 = {http://arxiv.org/abs/1407.5902}}
+
+@phdthesis{Ausaf,
+ author = {F.~Ausaf},
+ school = {King's College London},
+ title = {{V}erified {L}exing and {P}arsing},
+ year = {2018}}
+
+@inproceedings{Okui10,
+ abstract = {This paper offers a new efficient regular expression matching algorithm which follows the POSIX-type leftmost-longest rule. The algorithm basically emulates the subset construction without backtracking, so that its computational cost even in the worst case does not explode exponentially; the time complexity of the algorithm is O(mn(n{\thinspace}+{\thinspace}c)), where m is the length of a given input string, n the number of occurrences of the most frequently used letter in a given regular expression and c the number of subexpressions to be used for capturing substrings. A formalization of the leftmost-longest semantics by using parse trees is also discussed.},
+ address = {Berlin, Heidelberg},
+ author = {S.~Okui and T.~Suzuki},
+ booktitle = {Implementation and Application of Automata},
+ editor = {Domaratzki, Michael and Salomaa, Kai},
+ isbn = {978-3-642-18098-9},
+ pages = {231--240},
+ publisher = {Springer Berlin Heidelberg},
+ title = {Disambiguation in Regular Expression Matching via Position Automata with Augmented Transitions},
+ year = {2011}}
-%% look-aheads------------------------
+@article{Berglund14,
+	author = {M.~Berglund and F.~Drewes and B.~van der Merwe},
+ doi = {10.4204/EPTCS.151.7},
+ journal = {Electronic Proceedings in Theoretical Computer Science},
+ month = {05},
+ title = {Analyzing Catastrophic Backtracking Behavior in Practical Regular Expression Matching},
+ volume = {151},
+ year = {2014},
+ bdsk-url-1 = {https://doi.org/10.4204/EPTCS.151.7}}
+
+@inproceedings{Berglund18,
+ abstract = {Whereas Perl-compatible regular expression matchers typically exhibit some variation of leftmost-greedy semantics, those conforming to the posix standard are prescribed leftmost-longest semantics. However, the posix standard leaves some room for interpretation, and Fowler and Kuklewicz have done experimental work to confirm differences between various posix matchers. The Boost library has an interesting take on the posix standard, where it maximises the leftmost match not with respect to subexpressions of the regular expression pattern, but rather, with respect to capturing groups. In our work, we provide the first formalisation of Boost semantics, and we analyse the complexity of regular expression matching when using Boost semantics.},
+ address = {Cham},
+	author = {M.~Berglund and W.~Bester and B.~van der Merwe},
+ booktitle = {Theoretical Aspects of Computing -- ICTAC 2018},
+ editor = {Fischer, Bernd and Uustalu, Tarmo},
+ isbn = {978-3-030-02508-3},
+ pages = {99--115},
+ publisher = {Springer International Publishing},
+ title = {Formalising Boost POSIX Regular Expression Matching},
+ year = {2018}}
+
+@inproceedings{Chen12,
+ author = {Chen, Haiming and Yu, Sheng},
+ doi = {10.1007/978-3-642-27654-5_27},
+ month = {01},
+	pages = {343--356},
+ title = {Derivatives of Regular Expressions and an Application},
+ volume = {7160},
+ year = {2012},
+ bdsk-url-1 = {https://doi.org/10.1007/978-3-642-27654-5_27}}
+
@article{Takayuki2019,
- title={Derivatives of Regular Expressions with Lookahead},
- author={Takayuki Miyazaki and Yasuhiko Minamide},
- journal={Journal of Information Processing},
- volume={27},
- number={ },
- pages={422-430},
- year={2019},
- doi={10.2197/ipsjjip.27.422}
-}
-
-%% look-aheads------------------------
-
-
+ author = {Takayuki Miyazaki and Yasuhiko Minamide},
+ doi = {10.2197/ipsjjip.27.422},
+ journal = {Journal of Information Processing},
+	pages = {422--430},
+ title = {Derivatives of Regular Expressions with Lookahead},
+ volume = {27},
+ year = {2019},
+ bdsk-url-1 = {https://doi.org/10.2197/ipsjjip.27.422}}
-%% -------------------------------------
-%% back-references--------------------
@article{FERNAU2015287,
-title = {Pattern matching with variables: A multivariate complexity analysis},
-journal = {Information and Computation},
-volume = {242},
-pages = {287-305},
-year = {2015},
-issn = {0890-5401},
-doi = {https://doi.org/10.1016/j.ic.2015.03.006},
-url = {https://www.sciencedirect.com/science/article/pii/S0890540115000218},
-author = {H.~Fernau and M.L.~Schmid},
-keywords = {Parameterised pattern matching, Function matching, NP-completeness, Membership problem for pattern languages, Morphisms},
-abstract = {A pattern α, i.e., a string that contains variables and terminals, matches a terminal word w if w can be obtained by uniformly substituting the variables of α by terminal words. Deciding whether a given terminal word matches a given pattern is NP-complete and this holds for several natural variants of the problem that result from whether or not variables can be erased, whether or not the patterns are required to be terminal-free or whether or not the mapping of variables to terminal words must be injective. We consider numerous parameters of this problem (i.e., number of variables, length of w, length of the words substituted for variables, number of occurrences per variable, cardinality of the terminal alphabet) and for all possible combinations of the parameters (and variants described above), we answer the question whether or not the problem is still NP-complete if these parameters are bounded by constants.}
-}
+ abstract = {A pattern α, i.e., a string that contains variables and terminals, matches a terminal word w if w can be obtained by uniformly substituting the variables of α by terminal words. Deciding whether a given terminal word matches a given pattern is NP-complete and this holds for several natural variants of the problem that result from whether or not variables can be erased, whether or not the patterns are required to be terminal-free or whether or not the mapping of variables to terminal words must be injective. We consider numerous parameters of this problem (i.e., number of variables, length of w, length of the words substituted for variables, number of occurrences per variable, cardinality of the terminal alphabet) and for all possible combinations of the parameters (and variants described above), we answer the question whether or not the problem is still NP-complete if these parameters are bounded by constants.},
+ author = {H.~Fernau and M.L.~Schmid},
+	doi = {10.1016/j.ic.2015.03.006},
+	issn = {0890-5401},
+	journal = {Information and Computation},
+	keywords = {Parameterised pattern matching, Function matching, NP-completeness, Membership problem for pattern languages, Morphisms},
+	pages = {287--305},
+ title = {Pattern matching with variables: A multivariate complexity analysis},
+ url = {https://www.sciencedirect.com/science/article/pii/S0890540115000218},
+ volume = {242},
+ year = {2015},
+ bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0890540115000218},
+ bdsk-url-2 = {https://doi.org/10.1016/j.ic.2015.03.006}}
@inproceedings{Schmid2012,
-author = {M.L.~Schmid},
-title = {Inside the Class of REGEX Languages},
-year = {2012},
-isbn = {9783642316524},
-publisher = {Springer-Verlag},
-address = {Berlin, Heidelberg},
-url = {https://doi.org/10.1007/978-3-642-31653-1_8},
-doi = {10.1007/978-3-642-31653-1_8},
-abstract = {We study different possibilities of combining the concept of homomorphic replacement with regular expressions in order to investigate the class of languages given by extended regular expressions with backreferences (REGEX). It is shown in which regard existing and natural ways to do this fail to reach the expressive power of REGEX. Furthermore, the complexity of the membership problem for REGEX with a bounded number of backreferences is considered.},
-booktitle = {Proceedings of the 16th International Conference on Developments in Language Theory},
-pages = {73–84},
-numpages = {12},
-keywords = {extended regular expressions, pattern languages, REGEX, pattern expressions, homomorphic replacement},
-location = {Taipei, Taiwan},
-series = {DLT'12}
-}
-
-
+ abstract = {We study different possibilities of combining the concept of homomorphic replacement with regular expressions in order to investigate the class of languages given by extended regular expressions with backreferences (REGEX). It is shown in which regard existing and natural ways to do this fail to reach the expressive power of REGEX. Furthermore, the complexity of the membership problem for REGEX with a bounded number of backreferences is considered.},
+ address = {Berlin, Heidelberg},
+ author = {M.L.~Schmid},
+ booktitle = {Proceedings of the 16th International Conference on Developments in Language Theory},
+ doi = {10.1007/978-3-642-31653-1_8},
+ isbn = {9783642316524},
+ keywords = {extended regular expressions, pattern languages, REGEX, pattern expressions, homomorphic replacement},
+ location = {Taipei, Taiwan},
+ numpages = {12},
+ pages = {73--84},
+ publisher = {Springer-Verlag},
+ series = {DLT'12},
+ title = {Inside the Class of REGEX Languages},
+ url = {https://doi.org/10.1007/978-3-642-31653-1_8},
+ year = {2012},
+ bdsk-url-1 = {https://doi.org/10.1007/978-3-642-31653-1_8}}
@article{BERGLUND2022,
-title = {Re-examining regular expressions with backreferences},
-journal = {Theoretical Computer Science},
-year = {2022},
-issn = {0304-3975},
-doi = {https://doi.org/10.1016/j.tcs.2022.10.041},
-url = {https://www.sciencedirect.com/science/article/pii/S0304397522006570},
-author = {Martin Berglund and Brink {van der Merwe}},
-keywords = {Regular expressions, Backreferences},
-abstract = {Most modern regular expression matching libraries (one of the rare exceptions being Google's RE2) allow backreferences, operations which bind a substring to a variable, allowing it to be matched again verbatim. However, both real-world implementations and definitions in the literature use different syntactic restrictions and have differences in the semantics of the matching of backreferences. Our aim is to compare these various flavors by considering the classes of formal languages that each can describe, establishing, as a result, a hierarchy of language classes. Beyond the hierarchy itself, some complexity results are given, and as part of the effort on comparing language classes new pumping lemmas are established, old classes are extended to new ones, and several incidental results on the nature of these language classes are given.}
-}
+ abstract = {Most modern regular expression matching libraries (one of the rare exceptions being Google's RE2) allow backreferences, operations which bind a substring to a variable, allowing it to be matched again verbatim. However, both real-world implementations and definitions in the literature use different syntactic restrictions and have differences in the semantics of the matching of backreferences. Our aim is to compare these various flavors by considering the classes of formal languages that each can describe, establishing, as a result, a hierarchy of language classes. Beyond the hierarchy itself, some complexity results are given, and as part of the effort on comparing language classes new pumping lemmas are established, old classes are extended to new ones, and several incidental results on the nature of these language classes are given.},
+ author = {Martin Berglund and Brink {van der Merwe}},
+	doi = {10.1016/j.tcs.2022.10.041},
+ issn = {0304-3975},
+ journal = {Theoretical Computer Science},
+ keywords = {Regular expressions, Backreferences},
+ title = {Re-examining regular expressions with backreferences},
+ url = {https://www.sciencedirect.com/science/article/pii/S0304397522006570},
+ year = {2022},
+ bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0304397522006570},
+ bdsk-url-2 = {https://doi.org/10.1016/j.tcs.2022.10.041}}
@article{FREYDENBERGER20191,
-title = {Deterministic regular expressions with back-references},
-journal = {Journal of Computer and System Sciences},
-volume = {105},
-pages = {1-39},
-year = {2019},
-issn = {0022-0000},
-doi = {https://doi.org/10.1016/j.jcss.2019.04.001},
-url = {https://www.sciencedirect.com/science/article/pii/S0022000018301818},
-author = {Dominik D. Freydenberger and Markus L. Schmid},
-keywords = {Deterministic regular expression, Regex, Glushkov automaton},
-abstract = {Most modern libraries for regular expression matching allow back-references (i.e., repetition operators) that substantially increase expressive power, but also lead to intractability. In order to find a better balance between expressiveness and tractability, we combine these with the notion of determinism for regular expressions used in XML DTDs and XML Schema. This includes the definition of a suitable automaton model, and a generalization of the Glushkov construction. We demonstrate that, compared to their non-deterministic superclass, these deterministic regular expressions with back-references have desirable algorithmic properties (i.e., efficiently solvable membership problem and some decidable problems in static analysis), while, at the same time, their expressive power exceeds that of deterministic regular expressions without back-references.}
-}
-@InProceedings{Frey2013,
- author = {Dominik D. Freydenberger},
- title = {{Extended Regular Expressions: Succinctness and Decidability}},
- booktitle = {28th International Symposium on Theoretical Aspects of Computer Science (STACS 2011) },
- pages = {507--518},
- series = {Leibniz International Proceedings in Informatics (LIPIcs)},
- ISBN = {978-3-939897-25-5},
- ISSN = {1868-8969},
- year = {2011},
- volume = {9},
- editor = {Thomas Schwentick and Christoph D{\"u}rr},
- publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
- address = {Dagstuhl, Germany},
- URL = {http://drops.dagstuhl.de/opus/volltexte/2011/3039},
- URN = {urn:nbn:de:0030-drops-30396},
- doi = {10.4230/LIPIcs.STACS.2011.507},
- annote = {Keywords: extended regular expressions, regex, decidability, non-recursive tradeoffs}
-}
+ abstract = {Most modern libraries for regular expression matching allow back-references (i.e., repetition operators) that substantially increase expressive power, but also lead to intractability. In order to find a better balance between expressiveness and tractability, we combine these with the notion of determinism for regular expressions used in XML DTDs and XML Schema. This includes the definition of a suitable automaton model, and a generalization of the Glushkov construction. We demonstrate that, compared to their non-deterministic superclass, these deterministic regular expressions with back-references have desirable algorithmic properties (i.e., efficiently solvable membership problem and some decidable problems in static analysis), while, at the same time, their expressive power exceeds that of deterministic regular expressions without back-references.},
+ author = {Dominik D. Freydenberger and Markus L. Schmid},
+	doi = {10.1016/j.jcss.2019.04.001},
+	issn = {0022-0000},
+	journal = {Journal of Computer and System Sciences},
+	keywords = {Deterministic regular expression, Regex, Glushkov automaton},
+	pages = {1--39},
+ title = {Deterministic regular expressions with back-references},
+ url = {https://www.sciencedirect.com/science/article/pii/S0022000018301818},
+ volume = {105},
+ year = {2019},
+ bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0022000018301818},
+ bdsk-url-2 = {https://doi.org/10.1016/j.jcss.2019.04.001}}
-
+@inproceedings{Frey2013,
+ address = {Dagstuhl, Germany},
+ annote = {Keywords: extended regular expressions, regex, decidability, non-recursive tradeoffs},
+ author = {Dominik D. Freydenberger},
+ booktitle = {28th International Symposium on Theoretical Aspects of Computer Science (STACS 2011)},
+ doi = {10.4230/LIPIcs.STACS.2011.507},
+ editor = {Thomas Schwentick and Christoph D{\"u}rr},
+ isbn = {978-3-939897-25-5},
+ issn = {1868-8969},
+ pages = {507--518},
+ publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
+ series = {Leibniz International Proceedings in Informatics (LIPIcs)},
+ title = {{Extended Regular Expressions: Succinctness and Decidability}},
+ url = {http://drops.dagstuhl.de/opus/volltexte/2011/3039},
+ urn = {urn:nbn:de:0030-drops-30396},
+ volume = {9},
+ year = {2011},
+ bdsk-url-1 = {http://drops.dagstuhl.de/opus/volltexte/2011/3039},
+ bdsk-url-2 = {https://doi.org/10.4230/LIPIcs.STACS.2011.507}}
-%% -------------------------- campeanu related
@article{campeanu2003formal,
author = {C.~C{\^a}mpeanu and K.~Salomaa and S.~Yu},
journal = {International Journal of Foundations of Computer Science},
@@ -206,15 +223,15 @@
year = {2003}}
@article{campeanu2009patterns,
-author = {C.~C{\^a}mpeanu and N.~Santean},
-year = {2009},
-month = {05},
-pages = {193-207},
-title = {On the closure of pattern expressions languages under intersection with regular languages},
-volume = {46},
-journal = {Acta Inf.},
-doi = {10.1007/s00236-009-0090-y}
-}
+ author = {C.~C{\^a}mpeanu and N.~Santean},
+ doi = {10.1007/s00236-009-0090-y},
+ journal = {Acta Inf.},
+ month = {05},
+	pages = {193--207},
+ title = {On the closure of pattern expressions languages under intersection with regular languages},
+ volume = {46},
+ year = {2009},
+ bdsk-url-1 = {https://doi.org/10.1007/s00236-009-0090-y}}
@article{CAMPEANU2009Intersect,
abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255--300] and rigorously in 2003 by C{\^a}mpeanu, Salomaa and Yu [C. C{\^a}mpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007--1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines --- regex automata systems (RAS) --- which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.},
@@ -233,163 +250,236 @@
bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0304397509001789},
bdsk-url-2 = {https://doi.org/10.1016/j.tcs.2009.02.022}}
+@incollection{Aho1990,
+	abstract = {This chapter discusses the algorithms for solving string-matching problems that have proven useful for text-editing and text-processing applications. String pattern matching is an important problem that occurs in many areas of science and information processing. In computing, it occurs naturally as part of data processing, text editing, term rewriting, lexical analysis, and information retrieval. Many text editors and programming languages have facilities for matching strings. In biology, string-matching problems arise in the analysis of nucleic acids and protein sequences, and in the investigation of molecular phylogeny. String matching is also one of the central and most widely studied problems in theoretical computer science. The simplest form of the problem is to locate an occurrence of a keyword as a substring in a sequence of characters, which is called the input string. For example, the input string queueing contains the keyword ueuei as a substring. Even for this problem, several innovative, theoretically interesting algorithms have been devised that run significantly faster than the obvious brute-force method.},
+ address = {Amsterdam},
+ author = {A.V.~Aho},
+ booktitle = {Algorithms and Complexity},
+	doi = {10.1016/B978-0-444-88071-0.50010-2},
+	editor = {Jan {van Leeuwen}},
+	isbn = {978-0-444-88071-0},
+	pages = {255--300},
+	publisher = {Elsevier},
+	series = {Handbook of Theoretical Computer Science},
+	title = {Algorithms for Finding Patterns in Strings},
+ url = {https://www.sciencedirect.com/science/article/pii/B9780444880710500102},
+ year = {1990},
+ bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/B9780444880710500102},
+ bdsk-url-2 = {https://doi.org/10.1016/B978-0-444-88071-0.50010-2}}
-@incollection{Aho1990,
-title = {CHAPTER 5 - Algorithms for Finding Patterns in Strings},
-editor = {JAN {VAN LEEUWEN}},
-booktitle = {Algorithms and Complexity},
-publisher = {Elsevier},
-address = {Amsterdam},
-pages = {255-300},
-year = {1990},
-series = {Handbook of Theoretical Computer Science},
-isbn = {978-0-444-88071-0},
-doi = {https://doi.org/10.1016/B978-0-444-88071-0.50010-2},
-url = {https://www.sciencedirect.com/science/article/pii/B9780444880710500102},
-author = {A.V.~Aho},
-abstract = {Publisher Summary
-This chapter discusses the algorithms for solving string-matching problems that have proven useful for text-editing and text-processing applications. String pattern matching is an important problem that occurs in many areas of science and information processing. In computing, it occurs naturally as part of data processing, text editing, term rewriting, lexical analysis, and information retrieval. Many text editors and programming languages have facilities for matching strings. In biology, string-matching problems arise in the analysis of nucleic acids and protein sequences, and in the investigation of molecular phylogeny. String matching is also one of the central and most widely studied problems in theoretical computer science. The simplest form of the problem is to locate an occurrence of a keyword as a substring in a sequence of characters, which is called the input string. For example, the input string queueing contains the keyword ueuei as a substring. Even for this problem, several innovative, theoretically interesting algorithms have been devised that run significantly faster than the obvious brute-force method.}
-}
+@inproceedings{Might2011,
+ abstract = {We present a functional approach to parsing unrestricted context-free grammars based on Brzozowski's derivative of regular expressions. If we consider context-free grammars as recursive regular expressions, Brzozowski's equational theory extends without modification to context-free grammars (and it generalizes to parser combinators). The supporting actors in this story are three concepts familiar to functional programmers - laziness, memoization and fixed points; these allow Brzozowski's original equations to be transliterated into purely functional code in about 30 lines spread over three functions.Yet, this almost impossibly brief implementation has a drawback: its performance is sour - in both theory and practice. The culprit? Each derivative can double the size of a grammar, and with it, the cost of the next derivative.Fortunately, much of the new structure inflicted by the derivative is either dead on arrival, or it dies after the very next derivative. To eliminate it, we once again exploit laziness and memoization to transliterate an equational theory that prunes such debris into working code. Thanks to this compaction, parsing times become reasonable in practice.We equip the functional programmer with two equational theories that, when combined, make for an abbreviated understanding and implementation of a system for parsing context-free languages.},
+ address = {New York, NY, USA},
+ author = {M.~Might and D.~Darais and D.~Spiewak},
+ booktitle = {Proceedings of the 16th ACM SIGPLAN International Conference on Functional Programming},
+ doi = {10.1145/2034773.2034801},
+ isbn = {9781450308656},
+ keywords = {derivative, parsing, context-free grammar, parser combinator, formal languages, regular expressions},
+ location = {Tokyo, Japan},
+ numpages = {7},
+ pages = {189--195},
+ publisher = {Association for Computing Machinery},
+ series = {ICFP '11},
+ title = {Parsing with Derivatives: A Functional Pearl},
+ url = {https://doi.org/10.1145/2034773.2034801},
+ year = {2011},
+ bdsk-url-1 = {https://doi.org/10.1145/2034773.2034801}}
-%% back-references--------------------
-%% -------------------------------------
-%%----------------------------------------------------------------
-%%----------------------------------------zippers
-@article{10.1145/3408990,
-author = {Darragh, Pierce and Adams, Michael D.},
-title = {Parsing with Zippers (Functional Pearl)},
-year = {2020},
-issue_date = {August 2020},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-volume = {4},
-number = {ICFP},
-url = {https://doi.org/10.1145/3408990},
-doi = {10.1145/3408990},
-abstract = {Parsing with Derivatives (PwD) is an elegant approach to parsing context-free grammars (CFGs). It takes the equational theory behind Brzozowski's derivative for regular expressions and augments that theory with laziness, memoization, and fixed points. The result is a simple parser for arbitrary CFGs. Although recent work improved the performance of PwD, it remains inefficient due to the algorithm repeatedly traversing some parts of the grammar. In this functional pearl, we show how to avoid this inefficiency by suspending the state of the traversal in a zipper. When subsequent derivatives are taken, we can resume the traversal from where we left off without retraversing already traversed parts of the grammar. However, the original zipper is designed for use with trees, and we want to parse CFGs. CFGs can include shared regions, cycles, and choices between alternates, which makes them incompatible with the traditional tree model for zippers. This paper develops a generalization of zippers to properly handle these additional features. Just as PwD generalized Brzozowski's derivatives from regular expressions to CFGs, we generalize Huet's zippers from trees to CFGs. Abstract The resulting parsing algorithm is concise and efficient: it takes only 31 lines of OCaml code to implement the derivative function but performs 6,500 times faster than the original PwD and 3.24 times faster than the optimized implementation of PwD.},
-journal = {Proc. ACM Program. Lang.},
-month = {aug},
-articleno = {108},
-numpages = {28},
-keywords = {Derivatives, Zippers, Parsing with Derivatives, Parsing}
+@inproceedings{Adams2016,
+  author = {Adams, Michael D. and Hollenbeck, Celeste and Might, Matthew},
+  booktitle = {Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
+  pages = {224--236},
+  title = {On the complexity and performance of parsing with derivatives},
+  year = {2016}
}
-
-@article{Edelmann:287059,
- title = {Efficient Parsing with Derivatives and Zippers},
- author = {Edelmann, Romain},
- institution = {IINFCOM},
- publisher = {EPFL},
- address = {Lausanne},
- pages = {246},
- year = {2021},
- abstract = {Parsing is the process that enables a computer system to make sense of raw data. Parsing is common to almost all computer systems: It is involved every time sequential data is read and elaborated into structured data. The theory of parsing usually focuses on the binary recognition aspect of parsing and eschews this essential data-elaboration aspect. In this thesis, I present a declarative framework for value-aware parsing that explicitly integrates data elaboration.
+@article{Darragh2020,
+	abstract = {Parsing with Derivatives (PwD) is an elegant approach to parsing context-free grammars (CFGs). It takes the equational theory behind Brzozowski's derivative for regular expressions and augments that theory with laziness, memoization, and fixed points. The result is a simple parser for arbitrary CFGs. Although recent work improved the performance of PwD, it remains inefficient due to the algorithm repeatedly traversing some parts of the grammar. In this functional pearl, we show how to avoid this inefficiency by suspending the state of the traversal in a zipper. When subsequent derivatives are taken, we can resume the traversal from where we left off without retraversing already traversed parts of the grammar. However, the original zipper is designed for use with trees, and we want to parse CFGs. CFGs can include shared regions, cycles, and choices between alternates, which makes them incompatible with the traditional tree model for zippers. This paper develops a generalization of zippers to properly handle these additional features. Just as PwD generalized Brzozowski's derivatives from regular expressions to CFGs, we generalize Huet's zippers from trees to CFGs. The resulting parsing algorithm is concise and efficient: it takes only 31 lines of OCaml code to implement the derivative function but performs 6,500 times faster than the original PwD and 3.24 times faster than the optimized implementation of PwD.},
+ address = {New York, NY, USA},
+ articleno = {108},
+ author = {Darragh, Pierce and Adams, Michael D.},
+ doi = {10.1145/3408990},
+ issue_date = {August 2020},
+ journal = {Proc. ACM Program. Lang.},
+ keywords = {Derivatives, Zippers, Parsing with Derivatives, Parsing},
+ month = {aug},
+ number = {ICFP},
+ numpages = {28},
+ publisher = {Association for Computing Machinery},
+ title = {Parsing with Zippers (Functional Pearl)},
+ url = {https://doi.org/10.1145/3408990},
+ volume = {4},
+ year = {2020},
+ bdsk-url-1 = {https://doi.org/10.1145/3408990}}
+
+@phdthesis{Edelmann2021,
+ abstract = {Parsing is the process that enables a computer system to make sense of raw data. Parsing is common to almost all computer systems: It is involved every time sequential data is read and elaborated into structured data. The theory of parsing usually focuses on the binary recognition aspect of parsing and eschews this essential data-elaboration aspect. In this thesis, I present a declarative framework for value-aware parsing that explicitly integrates data elaboration.
Within the framework of the thesis, I present parsing algorithms that are based on the concept of Brzozowski's derivatives. Derivative-based parsing algorithms present several advantages: they are elegant, amenable to formal reasoning, and easy to implement. Unfortunately, the performance of these algorithms in practice is often not competitive with other approaches. In this thesis, I show a general technique inspired by Huet's Zipper to greatly enhance the performance of derivative-based algorithms, and I do so without compromising their elegance, amenability to formal reasoning, or ease of implementation.
First, I present a technique for building efficient tokenisers that is based on Brzozowski's derivatives and Huet's zipper and that does not require the usual burdensome explicit conversion to automata. I prove the technique is correct in Coq and present SILEX, a Scala lexing library based on the technique. I demonstrate that the approach is competitive with state-of-the-art solutions.
Then, I present a characterisation of LL(1) languages based on the concept of should-not-follow sets. I present an algorithm for parsing LL(1) languages with derivatives and zippers. I show a formal proof of the algorithm's correctness and prove its worst-case linear-time complexity. I show how the LL(1) parsing with derivatives and zippers algorithm corresponds to the traditional LL(1) parsing algorithm.
I then present SCALL1ON, a Scala parsing combinators library for LL(1) languages that incorporates the LL(1) parsing with derivatives and zippers algorithm. I present an expressive and familiar combinator-based interface for describing LL(1) languages. I present techniques that help precisely locate LL(1) conflicts in user code. I discuss several advantages of the parsing with derivatives approach within the context of a parsing library. I also present SCALL1ON's enumeration and pretty-printing features and discuss their implementation. Through a series of benchmarks, I demonstrate the good performance and practicality of the approach. Finally, I present how to adapt the LL(1) parsing with derivatives and zippers algorithm to support arbitrary context-free languages. I show how the adapted algorithm corresponds to general parsing algorithms, such as Earley's parsing algorithm.},
- url = {http://infoscience.epfl.ch/record/287059},
- doi = {10.5075/epfl-thesis-7357},
-}
+ address = {Lausanne},
+ author = {Edelmann, Romain},
+ doi = {10.5075/epfl-thesis-7357},
+	pages = {246},
+	school = {EPFL},
+ title = {Efficient Parsing with Derivatives and Zippers},
+ url = {http://infoscience.epfl.ch/record/287059},
+ year = {2021},
+ bdsk-url-1 = {http://infoscience.epfl.ch/record/287059},
+ bdsk-url-2 = {https://doi.org/10.5075/epfl-thesis-7357}}
@inproceedings{Zippy2020,
-author = {Edelmann, Romain and Hamza, Jad and Kun\v{c}ak, Viktor},
-title = {Zippy LL(1) Parsing with Derivatives},
-year = {2020},
-isbn = {9781450376136},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-url = {https://doi.org/10.1145/3385412.3385992},
-doi = {10.1145/3385412.3385992},
-abstract = {In this paper, we present an efficient, functional, and formally verified parsing algorithm for LL(1) context-free expressions based on the concept of derivatives of formal languages. Parsing with derivatives is an elegant parsing technique, which, in the general case, suffers from cubic worst-case time complexity and slow performance in practice. We specialise the parsing with derivatives algorithm to LL(1) context-free expressions, where alternatives can be chosen given a single token of lookahead. We formalise the notion of LL(1) expressions and show how to efficiently check the LL(1) property. Next, we present a novel linear-time parsing with derivatives algorithm for LL(1) expressions operating on a zipper-inspired data structure. We prove the algorithm correct in Coq and present an implementation as a part of Scallion, a parser combinators framework in Scala with enumeration and pretty printing capabilities.},
-booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
-pages = {1036–1051},
-numpages = {16},
-keywords = {Parsing, Zipper, Formal proof, LL(1), Derivatives},
-location = {London, UK},
-series = {PLDI 2020}
+ abstract = {In this paper, we present an efficient, functional, and formally verified parsing algorithm for LL(1) context-free expressions based on the concept of derivatives of formal languages. Parsing with derivatives is an elegant parsing technique, which, in the general case, suffers from cubic worst-case time complexity and slow performance in practice. We specialise the parsing with derivatives algorithm to LL(1) context-free expressions, where alternatives can be chosen given a single token of lookahead. We formalise the notion of LL(1) expressions and show how to efficiently check the LL(1) property. Next, we present a novel linear-time parsing with derivatives algorithm for LL(1) expressions operating on a zipper-inspired data structure. We prove the algorithm correct in Coq and present an implementation as a part of Scallion, a parser combinators framework in Scala with enumeration and pretty printing capabilities.},
+ address = {New York, NY, USA},
+ author = {Edelmann, Romain and Hamza, Jad and Kun\v{c}ak, Viktor},
+ booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
+ doi = {10.1145/3385412.3385992},
+ isbn = {9781450376136},
+ keywords = {Parsing, Zipper, Formal proof, LL(1), Derivatives},
+ location = {London, UK},
+ numpages = {16},
+ pages = {1036--1051},
+ publisher = {Association for Computing Machinery},
+ series = {PLDI 2020},
+ title = {Zippy LL(1) Parsing with Derivatives},
+ url = {https://doi.org/10.1145/3385412.3385992},
+ year = {2020},
+ bdsk-url-1 = {https://doi.org/10.1145/3385412.3385992}}
+
+@misc{fowler2003,
+	author = {Fowler, Glenn},
+	howpublished = {\url{https://web.archive.org/web/20050408073627/http://www.research.att.com/~gsf/testregex/re-interpretation.html}},
+	title = {An interpretation of the POSIX regex standard},
+	year = {2003}}
+
+@inproceedings{Snort1999,
+ abstract = {Network intrusion detection systems (NIDS) are an important part of any network security architecture. They provide a layer of defense which monitors network traffic for predefined suspicious activity or patterns, and alert system administrators when potential hostile traffic is detected. Commercial NIDS have many differences, but Information Systems departments must face the commonalities that they share such as significant system footprint, complex deployment and high monetary cost. Snort was designed to address these issues.},
+ address = {USA},
+ author = {M.~Roesch},
+ booktitle = {Proceedings of the 13th USENIX Conference on System Administration},
+ location = {Seattle, Washington},
+ numpages = {10},
+ pages = {229--238},
+ publisher = {USENIX Association},
+ series = {LISA '99},
+ title = {Snort - Lightweight Intrusion Detection for Networks},
+ year = {1999}}
+
+
+
+@inproceedings{Bro,
+	abstract = {We describe Bro, a stand-alone system for detecting network intruders in real-time by passively monitoring a network link over which the intruder's traffic transits. We give an overview of the system's design, which emphasizes high-speed (FDDI-rate) monitoring, real-time notification, clear separation between mechanism and policy, and extensibility. To achieve these ends, Bro is divided into an "event engine" that reduces a kernel-filtered network traffic stream into a series of higher-level events, and a "policy script interpreter" that interprets event handlers written in a specialized language used to express a site's security policy. Event handlers can update state information, synthesize new events, record information to disk, and generate real-time notifications via syslog. We also discuss a number of attacks that attempt to subvert passive monitoring systems and defenses against these, and give particulars of how Bro analyzes the four applications integrated into it so far: Finger, FTP, Portmapper and Telnet. The system is publicly available in source code form.},
+	address = {USA},
+	author = {V.~Paxson},
+	booktitle = {Proceedings of the 7th Conference on USENIX Security Symposium - Volume 7},
+	location = {San Antonio, Texas},
+	numpages = {1},
+	pages = {3},
+	publisher = {USENIX Association},
+	series = {SSYM'98},
+	title = {Bro: A System for Detecting Network Intruders in Real-Time},
+	year = {1998}
}
-%%----------------------------------------zippers
-%%----------------------------------------------------------------
+@inproceedings{Becchi08,
+ author = {Becchi, Michela and Crowley, Patrick},
+ doi = {10.1145/1544012.1544037},
+ month = {01},
+ pages = {25},
+ title = {Extending finite automata to efficiently match Perl-compatible regular expressions},
+ year = {2008},
+ bdsk-url-1 = {https://doi.org/10.1145/1544012.1544037}}
+
+@book{Sakarovitch2009,
+	address = {Cambridge},
+	author = {Sakarovitch, Jacques},
+	doi = {10.1017/CBO9781139195218},
+	note = {Translated by Reuben Thomas},
+ publisher = {Cambridge University Press},
+ title = {Elements of Automata Theory},
+ year = {2009},
+ bdsk-url-1 = {https://doi.org/10.1017/CBO9781139195218}}
+
+@unpublished{CSL2022,
+ author = {Chengsong Tan and Christian Urban},
+ note = {submitted},
+ title = {POSIX Lexing with Bitcoded Derivatives}}
+
+@inproceedings{Verbatim,
+ author = {Egolf, Derek and Lasser, Sam and Fisher, Kathleen},
+ booktitle = {2021 IEEE Security and Privacy Workshops (SPW)},
+ doi = {10.1109/SPW53761.2021.00022},
+	pages = {92--100},
+ title = {Verbatim: A Verified Lexer Generator},
+ year = {2021},
+ bdsk-url-1 = {https://doi.org/10.1109/SPW53761.2021.00022}}
-@article{fowler2003,
- title={An interpretation of the POSIX regex standard},
- author={Fowler, Glenn},
- journal={URL: https://web. archive. org/web/20050408073627/http://www. research. att. com/\~{} gsf/testregex/re-interpretation. html},
- year={2003}
+@inproceedings{Nipkow1998,
+ author = {Tobias Nipkow},
+ editor = {Jim Grundy and
+ Malcolm C. Newey},
+ title = {Verified Lexical Analysis},
+ booktitle = {Theorem Proving in Higher Order Logics, 11th International Conference,
+ TPHOLs'98, Canberra, Australia, September 27 - October 1, 1998, Proceedings},
+ series = {Lecture Notes in Computer Science},
+ volume = {1479},
+ pages = {1--15},
+ publisher = {Springer},
+ year = {1998},
+ url = {https://doi.org/10.1007/BFb0055126},
+ doi = {10.1007/BFb0055126},
+ timestamp = {Tue, 14 May 2019 10:00:48 +0200},
+ biburl = {https://dblp.org/rec/conf/tphol/Nipkow98.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
}
-@inproceedings{Snort1999,
-author = {Roesch, Martin},
-title = {Snort - Lightweight Intrusion Detection for Networks},
-year = {1999},
-publisher = {USENIX Association},
-address = {USA},
-abstract = {Network intrusion detection systems (NIDS) are an important part of any network security architecture. They provide a layer of defense which monitors network traffic for predefined suspicious activity or patterns, and alert system administrators when potential hostile traffic is detected. Commercial NIDS have many differences, but Information Systems departments must face the commonalities that they share such as significant system footprint, complex deployment and high monetary cost. Snort was designed to address these issues.},
-booktitle = {Proceedings of the 13th USENIX Conference on System Administration},
-pages = {229–238},
-numpages = {10},
-location = {Seattle, Washington},
-series = {LISA '99}
-}
-
-@inproceedings{Becchi08,
-author = {Becchi, Michela and Crowley, Patrick},
-year = {2008},
-month = {01},
-pages = {25},
-title = {Extending finite automata to efficiently match Perl-compatible regular expressions},
-doi = {10.1145/1544012.1544037}
-}
-
-@book{
- Sakarovitch2009,
- place={Cambridge},
- title={Elements of Automata Theory},
- DOI={10.1017/CBO9781139195218},
- publisher={Cambridge University Press},
- author={Sakarovitch, Jacques},
- editor={Thomas, ReubenTranslator},
- year={2009}
-}
-
-
-@unpublished{CSL2022,
-author = "Chengsong Tan and Christian Urban",
-title = "POSIX Lexing with Bitcoded Derivatives",
-note = "submitted",
-}
-
-@INPROCEEDINGS{Verbatim, author={Egolf, Derek and Lasser, Sam and Fisher, Kathleen}, booktitle={2021 IEEE Security and Privacy Workshops (SPW)}, title={Verbatim: A Verified Lexer Generator}, year={2021}, volume={}, number={}, pages={92-100}, doi={10.1109/SPW53761.2021.00022}}
-
-
@inproceedings{Verbatimpp,
-author = {Egolf, Derek and Lasser, Sam and Fisher, Kathleen},
-title = {Verbatim++: Verified, Optimized, and Semantically Rich Lexing with Derivatives},
-year = {2022},
-isbn = {9781450391825},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-url = {https://doi.org/10.1145/3497775.3503694},
-doi = {10.1145/3497775.3503694},
-abstract = {Lexers and parsers are attractive targets for attackers because they often sit at the boundary between a software system's internals and the outside world. Formally verified lexers can reduce the attack surface of these systems, thus making them more secure. One recent step in this direction is the development of Verbatim, a verified lexer based on the concept of Brzozowski derivatives. Two limitations restrict the tool's usefulness. First, its running time is quadratic in the length of its input string. Second, the lexer produces tokens with a simple "tag and string" representation, which limits the tool's ability to integrate with parsers that operate on more expressive token representations. In this work, we present a suite of extensions to Verbatim that overcomes these limitations while preserving the tool's original correctness guarantees. The lexer achieves effectively linear performance on a JSON benchmark through a combination of optimizations that, to our knowledge, has not been previously verified. The enhanced version of Verbatim also enables users to augment their lexical specifications with custom semantic actions, and it uses these actions to produce semantically rich tokens---i.e., tokens that carry values with arbitrary, user-defined types. All extensions were implemented and verified with the Coq Proof Assistant.},
-booktitle = {Proceedings of the 11th ACM SIGPLAN International Conference on Certified Programs and Proofs},
-pages = {27–39},
-numpages = {13},
-keywords = {Brzozowski derivatives, formal verification, lexical analysis, semantic actions},
-location = {Philadelphia, PA, USA},
-series = {CPP 2022}
-}
-
+ abstract = {Lexers and parsers are attractive targets for attackers because they often sit at the boundary between a software system's internals and the outside world. Formally verified lexers can reduce the attack surface of these systems, thus making them more secure. One recent step in this direction is the development of Verbatim, a verified lexer based on the concept of Brzozowski derivatives. Two limitations restrict the tool's usefulness. First, its running time is quadratic in the length of its input string. Second, the lexer produces tokens with a simple "tag and string" representation, which limits the tool's ability to integrate with parsers that operate on more expressive token representations. In this work, we present a suite of extensions to Verbatim that overcomes these limitations while preserving the tool's original correctness guarantees. The lexer achieves effectively linear performance on a JSON benchmark through a combination of optimizations that, to our knowledge, has not been previously verified. The enhanced version of Verbatim also enables users to augment their lexical specifications with custom semantic actions, and it uses these actions to produce semantically rich tokens---i.e., tokens that carry values with arbitrary, user-defined types. All extensions were implemented and verified with the Coq Proof Assistant.},
+ address = {New York, NY, USA},
+ author = {Egolf, Derek and Lasser, Sam and Fisher, Kathleen},
+ booktitle = {Proceedings of the 11th ACM SIGPLAN International Conference on Certified Programs and Proofs},
+ doi = {10.1145/3497775.3503694},
+ isbn = {9781450391825},
+ keywords = {Brzozowski derivatives, formal verification, lexical analysis, semantic actions},
+ location = {Philadelphia, PA, USA},
+ numpages = {13},
+ pages = {27--39},
+ publisher = {Association for Computing Machinery},
+ series = {CPP 2022},
+ title = {Verbatim++: Verified, Optimized, and Semantically Rich Lexing with Derivatives},
+ url = {https://doi.org/10.1145/3497775.3503694},
+ year = {2022},
+ bdsk-url-1 = {https://doi.org/10.1145/3497775.3503694}}
@article{Turo_ov__2020,
author = {Lenka Turo{\v{n}}ov{\'{a}} and Luk{\'{a}}{\v{s}} Hol{\'{\i}}k and Ond{\v{r}}ej Leng{\'{a}}l and Olli Saarikivi and Margus Veanes and Tom{\'{a}}{\v{s}} Vojnar},
@@ -464,7 +554,6 @@
title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics},
year = {2017}}
-
@article{alfred2014algorithms,
author = {Aho, Alfred V.},
journal = {Algorithms and Complexity},
@@ -474,7 +563,6 @@
volume = {1},
year = {2014}}
-
@article{nielson11bcre,
author = {Lasse Nielsen and Fritz Henglein},
date-added = {2019-07-03 21:09:39 +0000},
@@ -504,8 +592,13 @@
title = {PCRE},
url = {https://www.pcre.org/original/doc/html/},
year = {2021},
- bdsk-url-1 = {https://www.pcre.org/original/doc/html/}
-}
+ bdsk-url-1 = {https://www.pcre.org/original/doc/html/}}
+
+@misc{communityRules,
+ howpublished = {\url{https://www.snort.org/faq/what-are-community-rules}},
+ note = {[Online; last accessed 19-November-2022]},
+ title = {{Snort Community Rules}},
+ year = {2022}}
@misc{KuklewiczHaskell,
author = {C.~Kuklewicz},
@@ -538,23 +631,22 @@
year = {2014}}
@inproceedings{xml2015,
-author = {Bj\"{o}rklund, Henrik and Martens, Wim and Timm, Thomas},
-title = {Efficient Incremental Evaluation of Succinct Regular Expressions},
-year = {2015},
-isbn = {9781450337946},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-url = {https://doi.org/10.1145/2806416.2806434},
-doi = {10.1145/2806416.2806434},
-abstract = {Regular expressions are omnipresent in database applications. They form the structural core of schema languages for XML, they are a fundamental ingredient for navigational queries in graph databases, and are being considered in languages for upcoming technologies such as schema- and transformation languages for tabular data on the Web. In this paper we study the usage and effectiveness of the counting operator (or: limited repetition) in regular expressions. The counting operator is a popular extension which is part of the POSIX standard and therefore also present in regular expressions in grep, Java, Python, Perl, and Ruby. In a database context, expressions with counting appear in XML Schema and languages for querying graphs such as SPARQL 1.1 and Cypher.We first present a practical study that suggests that counters are extensively used in practice. We then investigate evaluation methods for such expressions and develop a new algorithm for efficient incremental evaluation. Finally, we conduct an extensive benchmark study that shows that exploiting counting operators can lead to speed-ups of several orders of magnitude in a wide range of settings: normal and incremental evaluation on synthetic and real expressions.},
-booktitle = {Proceedings of the 24th ACM International on Conference on Information and Knowledge Management},
-pages = {1541–1550},
-numpages = {10},
-keywords = {regular expressions, schema, regular path queries, xml},
-location = {Melbourne, Australia},
-series = {CIKM '15}
-}
-
+ abstract = {Regular expressions are omnipresent in database applications. They form the structural core of schema languages for XML, they are a fundamental ingredient for navigational queries in graph databases, and are being considered in languages for upcoming technologies such as schema- and transformation languages for tabular data on the Web. In this paper we study the usage and effectiveness of the counting operator (or: limited repetition) in regular expressions. The counting operator is a popular extension which is part of the POSIX standard and therefore also present in regular expressions in grep, Java, Python, Perl, and Ruby. In a database context, expressions with counting appear in XML Schema and languages for querying graphs such as SPARQL 1.1 and Cypher. We first present a practical study that suggests that counters are extensively used in practice. We then investigate evaluation methods for such expressions and develop a new algorithm for efficient incremental evaluation. Finally, we conduct an extensive benchmark study that shows that exploiting counting operators can lead to speed-ups of several orders of magnitude in a wide range of settings: normal and incremental evaluation on synthetic and real expressions.},
+ address = {New York, NY, USA},
+ author = {Bj\"{o}rklund, Henrik and Martens, Wim and Timm, Thomas},
+ booktitle = {Proceedings of the 24th ACM International on Conference on Information and Knowledge Management},
+ doi = {10.1145/2806416.2806434},
+ isbn = {9781450337946},
+ keywords = {regular expressions, schema, regular path queries, xml},
+ location = {Melbourne, Australia},
+ numpages = {10},
+ pages = {1541--1550},
+ publisher = {Association for Computing Machinery},
+ series = {CIKM '15},
+ title = {Efficient Incremental Evaluation of Succinct Regular Expressions},
+ url = {https://doi.org/10.1145/2806416.2806434},
+ year = {2015},
+ bdsk-url-1 = {https://doi.org/10.1145/2806416.2806434}}
@misc{SE16,
author = {StackStatus},
@@ -803,8 +895,6 @@
volume = {7086},
year = {2011}}
-
-
@inproceedings{Almeidaetal10,
author = {J.~B.~Almeida and N.~Moreira and D.~Pereira and S.~M.~de Sousa},
booktitle = {Proc.~of the 15th International Conference on Implementation and Application of Automata (CIAA)},
@@ -843,23 +933,6 @@
url = {http://www.pcre.org},
bdsk-url-1 = {http://www.pcre.org}}
-%@inproceedings{OkuiSuzuki2010,
-% author = {S.~Okui and T.~Suzuki},
-% booktitle = {Proc.~of the 15th International Conference on Implementation and Application of Automata (CIAA)},
-% pages = {231--240},
-% series = {LNCS},
-% title = {{D}isambiguation in {R}egular {E}xpression {M}atching via {P}osition {A}utomata with {A}ugmented {T}ransitions},
-% volume = {6482},
-% year = {2010}}
-%
-
-
-%@techreport{OkuiSuzukiTech,
-% author = {S.~Okui and T.~Suzuki},
-% institution = {University of Aizu},
-% title = {{D}isambiguation in {R}egular {E}xpression {M}atching via {P}osition {A}utomata with {A}ugmented {T}ransitions},
-% year = {2013}}
-
@inproceedings{Davis18,
author = {J.~C.~Davis and C.~A.~Coghlan and F.~Servant and D.~Lee},
booktitle = {Proc.~of the 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE)},