146
|
1 |
\documentclass[a4paper,UKenglish]{lipics}
|
|
2 |
\usepackage{graphic}
|
|
3 |
\usepackage{data}
|
|
4 |
\usepackage{tikz-cd}
|
|
5 |
\usepackage{tikz}
|
|
6 |
|
|
7 |
%\usetikzlibrary{graphs}
|
|
8 |
%\usetikzlibrary{graphdrawing}
|
|
9 |
%\usegdlibrary{trees}
|
|
10 |
|
|
11 |
%\usepackage{algorithm}
|
|
12 |
\usepackage{amsmath}
|
|
13 |
\usepackage{xcolor}
|
|
14 |
\usepackage[noend]{algpseudocode}
|
|
15 |
\usepackage{enumitem}
|
|
16 |
\usepackage{nccmath}
|
|
17 |
\usepackage{soul}
|
|
18 |
|
|
19 |
\definecolor{darkblue}{rgb}{0,0,0.6}
|
|
20 |
\hypersetup{colorlinks=true,allcolors=darkblue}
|
|
21 |
\newcommand{\comment}[1]%
|
|
22 |
{{\color{red}$\Rightarrow$}\marginpar{\raggedright\small{\bf\color{red}#1}}}
|
|
23 |
|
|
24 |
% \documentclass{article}
|
|
25 |
%\usepackage[utf8]{inputenc}
|
|
26 |
%\usepackage[english]{babel}
|
|
27 |
%\usepackage{listings}
|
|
28 |
% \usepackage{amsthm}
|
|
29 |
%\usepackage{hyperref}
|
|
30 |
% \usepackage[margin=0.5in]{geometry}
|
|
31 |
%\usepackage{pmboxdraw}
|
|
32 |
|
|
33 |
\title{POSIX Regular Expression Matching and Lexing}
|
|
34 |
\author{Chengsong Tan}
|
|
35 |
\affil{King's College London\\
|
|
36 |
London, UK\\
|
|
37 |
\texttt{chengsong.tan@kcl.ac.uk}}
|
|
38 |
\authorrunning{Chengsong Tan}
|
|
39 |
\Copyright{Chengsong Tan}
|
|
40 |
|
|
41 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
|
|
42 |
\newcommand{\ZERO}{\mbox{\bf 0}}
|
|
43 |
\newcommand{\ONE}{\mbox{\bf 1}}
|
|
44 |
\def\erase{\textit{erase}}
|
|
45 |
\def\bders{\textit{bders}}
|
|
46 |
\def\lexer{\mathit{lexer}}
|
|
47 |
\def\blexer{\textit{blexer}}
|
|
48 |
\def\fuse{\textit{fuse}}
|
|
49 |
\def\flatten{\textit{flatten}}
|
|
50 |
\def\map{\textit{map}}
|
|
51 |
\def\blexers{\mathit{blexer\_simp}}
|
|
52 |
\def\simp{\mathit{simp}}
|
|
53 |
\def\mkeps{\mathit{mkeps}}
|
|
54 |
\def\bmkeps{\textit{bmkeps}}
|
|
55 |
\def\inj{\mathit{inj}}
|
|
56 |
\def\Empty{\mathit{Empty}}
|
|
57 |
\def\Left{\mathit{Left}}
|
|
58 |
\def\Right{\mathit{Right}}
|
|
59 |
\def\Stars{\mathit{Stars}}
|
|
60 |
\def\Char{\mathit{Char}}
|
|
61 |
\def\Seq{\mathit{Seq}}
|
|
62 |
\def\Der{\mathit{Der}}
|
|
63 |
\def\nullable{\mathit{nullable}}
|
|
64 |
\def\Z{\mathit{Z}}
|
|
65 |
\def\S{\mathit{S}}
|
|
66 |
\def\flex{\textit{flex}}
|
|
67 |
\def\rup{r^\uparrow}
|
|
68 |
\def\retrieve{\textit{retrieve}}
|
|
69 |
\def\AALTS{\textit{AALTS}}
|
|
70 |
\def\AONE{\textit{AONE}}
|
|
71 |
%\theoremstyle{theorem}
|
|
72 |
%\newtheorem{theorem}{Theorem}
|
|
73 |
%\theoremstyle{lemma}
|
|
74 |
%\newtheorem{lemma}{Lemma}
|
|
75 |
%\newcommand{\lemmaautorefname}{Lemma}
|
|
76 |
%\theoremstyle{definition}
|
|
77 |
%\newtheorem{definition}{Definition}
|
|
78 |
\algnewcommand\algorithmicswitch{\textbf{switch}}
|
|
79 |
\algnewcommand\algorithmiccase{\textbf{case}}
|
|
80 |
\algnewcommand\algorithmicassert{\texttt{assert}}
|
|
81 |
\algnewcommand\Assert[1]{\State \algorithmicassert(#1)}%
|
|
82 |
% New "environments"
|
|
83 |
\algdef{SE}[SWITCH]{Switch}{EndSwitch}[1]{\algorithmicswitch\ #1\ \algorithmicdo}{\algorithmicend\ \algorithmicswitch}%
|
|
84 |
\algdef{SE}[CASE]{Case}{EndCase}[1]{\algorithmiccase\ #1}{\algorithmicend\ \algorithmiccase}%
|
|
85 |
\algtext*{EndSwitch}%
|
|
86 |
\algtext*{EndCase}%
|
|
87 |
|
|
88 |
|
|
89 |
\begin{document}
|
|
90 |
|
|
91 |
\maketitle
|
147
|
92 |
Suppose (basic) regular expressions are given by the following grammar:
|
146
|
93 |
|
147
|
94 |
\[ r ::= \ZERO \mid \ONE
|
|
95 |
\mid c
|
|
96 |
\mid r_1 \cdot r_2
|
|
97 |
\mid r_1 + r_2
|
|
98 |
\mid r^*
|
|
99 |
\]
|
|
100 |
|
|
101 |
|
|
102 |
If we let the alphabet
|
|
103 |
where $c$ is selected from
|
|
104 |
be $\sum = \{0,1\}$,
|
|
105 |
then bitcodes can be defined in a
|
|
106 |
regular expression style:
|
|
107 |
|
|
108 |
|
|
109 |
\[ bs ::= \ZERO \mid \ONE
|
|
110 |
\mid 1
|
|
111 |
\mid 0
|
|
112 |
\mid bs_1 \cdot bs_2
|
|
113 |
\mid \sum{bs_{list}}
|
|
114 |
\mid bs^*
|
|
115 |
\]
|
|
116 |
|
|
117 |
We can define an isomorphism between the regex
|
|
118 |
definition of bitcodes and our list definition of bitcodes:
|
146
|
119 |
\begin{center}
|
147
|
120 |
$b ::= 1 \mid 0 \qquad
|
|
121 |
bs ::= [] \mid b::bs
|
|
122 |
$
|
|
123 |
\end{center}
|
|
124 |
For example we can let $\sigma([])= \ONE$.
|
|
125 |
But how to define such isomorphism in detail is not explicitly needed for now.
|
|
126 |
|
|
127 |
\emph{Annotated regular expressions} can be defined by the following
|
|
128 |
grammar using new $bs$ definition:
|
|
129 |
|
|
130 |
\begin{center}
|
|
131 |
\begin{tabular}{lcl}
|
|
132 |
$\textit{a}$ & $::=$ & $\ZERO$\\
|
|
133 |
& $\mid$ & $_{bs}\ONE$\\
|
|
134 |
& $\mid$ & $_{bs}{\bf c}$\\
|
|
135 |
& $\mid$ & $_{bs}\sum\,as$\\
|
|
136 |
& $\mid$ & $_{bs}a_1\cdot a_2$\\
|
|
137 |
& $\mid$ & $_{bs}a^*$
|
|
138 |
\end{tabular}
|
|
139 |
\end{center}
|
|
140 |
Let the set of all bitcoded regular expressions be $\textit{BS}$.
|
|
141 |
Let the set of all annotated regular expression be $\textit{AR}$.
|
|
142 |
Let us play with the function $f: \textit{AR} \rightarrow \textit{BS}$ on annotated regular expressions:
|
|
143 |
\begin{center}
|
|
144 |
$f(\ZERO) = \ZERO$\\
|
146
|
145 |
$f(_{bs}\ONE) = \textit{bs}$\\
|
|
146 |
$f(_{bs}a) = \textit{bs} $\\
|
147
|
147 |
$f(_{bs}r_1\cdot r_2) = \textit{bs} \cdot( f(r_1) \cdot f(r_2))$\\
|
146
|
148 |
$f(_{bs}\sum{rs}) = \textit{bs} \cdot \sum\limits_{r \in rs}{f(\textit{r})}$\\
|
147
|
149 |
$f(_{bs}r^*) = \textit{bs} \cdot((0 \cdot f(r))^*\cdot 1) $
|
146
|
150 |
\end{center}
|
|
151 |
|
147
|
152 |
We claim that:
|
|
153 |
\begin{center}
|
|
154 |
$f(a) = \{bs \mid a \gg bs\}$.
|
|
155 |
\end{center}
|
146
|
156 |
|
|
157 |
\bibliographystyle{plain}
|
|
158 |
\bibliography{root}
|
|
159 |
|
|
160 |
|
|
161 |
\end{document}
|
|
162 |
|