thys3/Paper.thy
author Chengsong
Wed, 23 Aug 2023 03:02:31 +0100
changeset 668 3831621d7b14
parent 644 9f984ff20020
permissions -rw-r--r--
added technical Overview section, almost done introduction
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     1
(*<*)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     2
theory Paper
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     3
  imports
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
     4
   "Posix.LexerSimp" 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     5
   "Posix.FBound" 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     6
   "HOL-Library.LaTeXsugar"
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     7
begin
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     8
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     9
declare [[show_question_marks = false]]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    10
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    11
notation (latex output)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    12
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    13
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    14
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    15
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    16
abbreviation 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    17
  "der_syn r c \<equiv> der c r"
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    18
abbreviation 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    19
 "ders_syn r s \<equiv> ders s r"  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    20
abbreviation 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    21
  "bder_syn r c \<equiv> bder c r"  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    22
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    23
notation (latex output)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    24
  der_syn ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    25
  ders_syn ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    26
  bder_syn ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    27
  bders ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    28
  bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    29
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    30
  ZERO ("\<^bold>0" 81) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    31
  ONE ("\<^bold>1" 81) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    32
  CH ("_" [1000] 80) and
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    33
  ALT ("_ + _" [76,76] 77) and
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    34
  SEQ ("_ \<cdot> _" [78,78] 78) and
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    35
  STAR ("_\<^sup>*" [79] 78) and
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    36
  NTIMES ("_\<^sup>{\<^bsup>_\<^esup>\<^sup>}" [79] 78) and
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    37
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    38
  val.Void ("Empty" 78) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    39
  val.Char ("Char _" [1000] 78) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    40
  val.Left ("Left _" [79] 78) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    41
  val.Right ("Right _" [1000] 78) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    42
  val.Seq ("Seq _ _" [79,79] 78) and
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    43
  val.Stars ("Stars _" [1000] 78) and
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    44
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    45
  Prf ("\<turnstile> _ : _" [75,75] 75) and  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    46
  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    47
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    48
  flat ("|_|" [75] 74) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    49
  flats ("|_|" [72] 74) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    50
  injval ("inj _ _ _" [79,77,79] 76) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    51
  mkeps ("mkeps _" [79] 76) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    52
  length ("len _" [73] 73) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    53
  set ("_" [73] 73) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    54
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    55
  AZERO ("ZERO" 81) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    56
  AONE ("ONE _" [79] 78) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    57
  ACHAR ("CHAR _ _" [79, 79] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    58
  AALTs ("ALTs _ _" [77,77] 78) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    59
  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    60
  ASTAR ("STAR _ _" [79, 79] 78) and
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    61
  ANTIMES ("NT _ _ _" [79, 79, 79] 78) and
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    62
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    63
  code ("code _" [79] 74) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    64
  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    65
  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    66
  bnullable ("bnullable _" [1000] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    67
  bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    68
  bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    69
  bmkeps ("bmkeps _" [1000] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    70
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    71
  eq1 ("_ \<approx> _" [77,77] 76) and 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    72
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    73
  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    74
  rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    75
  srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    76
  blexer_simp ("blexer\<^sup>+" 1000) 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    77
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    78
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    79
lemma better_retrieve:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    80
   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    81
   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    82
  apply (metis list.exhaust retrieve.simps(4))
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    83
  by (metis list.exhaust retrieve.simps(5))
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    84
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    85
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    86
lemma better_retrieve2:
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    87
  shows "retrieve (ANTIMES bs r (n + 1)) (Stars (v#vs)) = 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    88
     bs @ [Z] @ retrieve r v @ retrieve (ANTIMES [] r n) (Stars vs)"
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    89
by auto
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    90
(*>*)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    91
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    92
section {* Introduction *}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    93
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    94
text {*
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    95
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    96
In the last fifteen or so years, Brzozowski's derivatives of regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    97
expressions have sparked quite a bit of interest in the functional
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
    98
programming and theorem prover communities.
563
c92a41d9c4da updated paper and theories to include n-times
Christian Urban <christian.urban@kcl.ac.uk>
parents: 502
diff changeset
    99
Derivatives of a
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   100
regular expressions, written @{term "der c r"}, give a simple solution
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   101
to the problem of matching a string @{term s} with a regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   102
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   103
succession) all the characters of the string matches the empty string,
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   104
then @{term r} matches @{term s} (and {\em vice versa}).  
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   105
The beauty of
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   106
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   107
expressible in any functional language, and easily definable and
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   108
reasoned about in theorem provers---the definitions just consist of
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   109
inductive datatypes and simple recursive functions. Another attractive
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   110
feature of derivatives is that they can be easily extended to \emph{bounded}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   111
regular expressions, such as @{term "r"}$^{\{@{text n}\}}$ or @{term r}$^{\{..@{text n}\}}$, where numbers or
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   112
intervals of numbers specify how many times a regular expression should be used
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   113
during matching.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   114
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   115
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   116
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   117
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   118
However, there are two difficulties with derivative-based matchers:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   119
First, Brzozowski's original matcher only generates a yes/no answer
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   120
for whether a regular expression matches a string or not.  This is too
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   121
little information in the context of lexing where separate tokens must
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   122
be identified and also classified (for example as keywords
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   123
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   124
difficulty by cleverly extending Brzozowski's matching
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   125
algorithm. Their extended version generates additional information on
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   126
\emph{how} a regular expression matches a string following the POSIX
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   127
rules for regular expression matching. They achieve this by adding a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   128
second ``phase'' to Brzozowski's algorithm involving an injection
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   129
function.  In our own earlier work we provided the formal
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   130
specification of what POSIX matching means and proved in Isabelle/HOL
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   131
the correctness
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   132
of Sulzmann and Lu's extended algorithm accordingly
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   133
\cite{AusafDyckhoffUrban2016}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   134
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   135
The second difficulty is that Brzozowski's derivatives can 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   136
grow to arbitrarily big sizes. For example if we start with the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   137
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   138
successive derivatives according to the character $a$, we end up with
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   139
a sequence of ever-growing derivatives like 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   140
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   141
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   142
\begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   143
\begin{tabular}{rll}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   144
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   145
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   146
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   147
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   148
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   149
\end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   150
\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   151
 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   152
\noindent where after around 35 steps we run out of memory on a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   153
typical computer (we shall define shortly the precise details of our
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   154
regular expressions and the derivative operation).  Clearly, the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   155
notation involving $\ZERO$s and $\ONE$s already suggests
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   156
simplification rules that can be applied to regular regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   157
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   158
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   159
r$. While such simple-minded simplifications have been proved in our
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   160
earlier work to preserve the correctness of Sulzmann and Lu's
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   161
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   162
\emph{not} help with limiting the growth of the derivatives shown
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   163
above: the growth is slowed, but some derivatives can still grow rather
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   164
quickly beyond any finite bound.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   165
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   166
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   167
Sulzmann and Lu address this ``growth problem'' in a second algorithm
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   168
\cite{Sulzmann2014} where they introduce bitcoded
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   169
regular expressions. In this version, POSIX values are
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   170
represented as bitsequences and such sequences are incrementally generated
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   171
when derivatives are calculated. The compact representation
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   172
of bitsequences and regular expressions allows them to define a more
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   173
``aggressive'' simplification method that keeps the size of the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   174
derivatives finitely bounded no matter what the length of the string is.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   175
They make some informal claims about the correctness and linear behaviour
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   176
of this version, but do not provide any supporting proof arguments, not
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   177
even ``pencil-and-paper'' arguments. They write about their bitcoded
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   178
\emph{incremental parsing method} (that is the algorithm to be fixed and formalised
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   179
in this paper):
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   180
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   181
\begin{quote}\it
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   182
  ``Correctness Claim: We further claim that the incremental parsing
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   183
  method [..] in combination with the simplification steps [..]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   184
  yields POSIX parse trees. We have tested this claim
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   185
  extensively [..] but yet
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   186
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   187
\end{quote}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   188
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   189
\noindent{}\textbf{Contributions:} We fill this gap by implementing in Isabelle/HOL
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   190
our version of the derivative-based lexing algorithm of Sulzmann and Lu
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   191
\cite{Sulzmann2014} where regular expressions are annotated with
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   192
bitsequences. We define the crucial simplification function as a
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   193
recursive function, without the need of a fixpoint operation. One objective of
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   194
the simplification function is to remove duplicates of regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   195
expressions.  For this Sulzmann and Lu use in their paper the standard
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   196
@{text nub} function from Haskell's list library, but this function
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   197
does not achieve the intended objective with bitcoded regular expressions.  The
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   198
reason is that in the bitcoded setting, each copy generally has a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   199
different bitcode annotation---so @{text nub} would never ``fire''.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   200
Inspired by Scala's library for lists, we shall instead use a @{text
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   201
distinctWith} function that finds duplicates under an ``erasing'' function
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   202
that deletes bitcodes before comparing regular expressions.
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   203
We shall also introduce our \emph{own} arguments and definitions for
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   204
establishing the correctness of the bitcoded algorithm when 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   205
simplifications are included. Finally we
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   206
establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   207
%of regular expressions and describe the definition
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   208
%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   209
%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   210
%the correctness for the bitcoded algorithm without simplification, and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   211
%after that extend the proof to include simplification.}\smallskip
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   212
%\mbox{}\\[-10mm]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   213
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   214
\noindent In this paper, we shall first briefly introduce the basic notions
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   215
of regular expressions and describe the definition
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   216
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   217
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   218
the correctness for the bitcoded algorithm without simplification, and
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   219
after that extend the proof to include simplification.
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   220
Our Isabelle code including the results from Sec.~5 is available
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   221
from \textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   222
\mbox{}\\[-6mm]
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   223
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   224
*}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   225
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   226
section {* Background *}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   227
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   228
text {*
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   229
  In our Isabelle/HOL formalisation strings are lists of characters with
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   230
  the empty string being represented by the empty list, written $[]$,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   231
  and list-cons being written as $\_\!::\!\_\,$; string
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   232
  concatenation is $\_ \,@\, \_\,$. We often use the usual
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   233
  bracket notation for lists also for strings; for example a string
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   234
  consisting of just a single character $c$ is written $[c]$.   
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   235
  Our regular expressions are defined as the following inductive
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   236
  datatype:\\[-4mm]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   237
  %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   238
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   239
  @{text "r ::="} \;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   240
  @{const "ZERO"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   241
  @{const "ONE"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   242
  @{term "CH c"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   243
  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   244
  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
563
c92a41d9c4da updated paper and theories to include n-times
Christian Urban <christian.urban@kcl.ac.uk>
parents: 502
diff changeset
   245
  @{term "STAR r"} $\mid$
c92a41d9c4da updated paper and theories to include n-times
Christian Urban <christian.urban@kcl.ac.uk>
parents: 502
diff changeset
   246
  @{term "NTIMES r n"}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   247
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   248
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   249
  \noindent where @{const ZERO} stands for the regular expression that
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   250
  does not match any string, @{const ONE} for the regular expression
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   251
  that matches only the empty string and @{term c} for matching a
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   252
  character literal.  The constructors $+$ and $\cdot$ represent
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   253
  alternatives and sequences, respectively.  We sometimes omit the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   254
  $\cdot$ in a sequence regular expression for brevity.  The
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   255
  \emph{language} of a regular expression, written $L(r)$, is defined
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   256
  as usual and we omit giving the definition here (see for example
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   257
  \cite{AusafDyckhoffUrban2016}).
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   258
  
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   259
  In our work here we also add to the usual ``basic'' regular
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   260
  expressions the \emph{bounded} regular expression @{term "NTIMES r
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   261
  n"} where the @{term n} specifies that @{term r} should match
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   262
  exactly @{term n}-times (it is not included in Sulzmann and Lu's original work). For brevity we omit the other bounded
644
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   263
  regular expressions @{text "r"}$^{\{..\textit{n}\}}$,
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   264
  @{text "r"}$^{\{\textit{n}..\}}$
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   265
  and @{text "r"}$^{\{\textit{n}..\textit{m}\}}$ which specify intervals for how many
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   266
  times @{text r} should match. The results presented in this paper
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   267
  extend straightforwardly to them too. The importance of the bounded
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   268
  regular expressions is that they are often used in practical
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   269
  applications, such as Snort (a system for detecting network
599
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
   270
  intrusions) and also in XML Schema definitions. According to Bj\"{o}rklund et
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   271
  al~\cite{BjorklundMartensTimm2015}, bounded regular expressions 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   272
  occur frequently in the latter and can have counters of up to
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   273
  ten million.  The problem is that tools based on the classic notion
644
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   274
  of automata need to expand @{text "r"}$^{\{\textit{n}\}}$ into @{text n}
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   275
  connected copies of the automaton for @{text r}. This leads to very
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   276
  inefficient matching algorithms or algorithms that consume large
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   277
  amounts of memory.  A classic example is the regular expression
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   278
  \mbox{@{term "SEQ (SEQ (STAR (ALT a b)) a) (NTIMES (ALT a b) n)"}}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   279
  where the minimal DFA requires at least $2^{n + 1}$ states (see
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   280
  \cite{CountingSet2020}). Therefore regular expression matching
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   281
  libraries that rely on the classic notion of DFAs often impose 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   282
  adhoc limits for bounded regular expressions: For example in the
571
a76458dd9e4c added paper about counting automata
Christian Urban <christian.urban@kcl.ac.uk>
parents: 569
diff changeset
   283
  regular expression matching library in the Go language and also in Google's RE2 library the regular expression
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   284
  @{term "NTIMES a 1001"} is not permitted, because no counter can be
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   285
  above 1000; and in the regular expression library in Rust
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   286
  expressions such as @{text "a\<^bsup>{1000}{100}{5}\<^esup>"} give an error
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   287
  message for being too big. Up until recently,\footnote{up until version 1.5.4 of the regex
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   288
  library in Rust; see also CVE-2022-24713.} Rust
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   289
  however happily generated automata for regular expressions such as 
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   290
  @{text "a\<^bsup>{0}{4294967295}\<^esup>"}. This was due to a bug
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   291
  in the algorithm that decides when a regular expression is acceptable
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   292
  or too big according to Rust's classification (it did not account for the fact that @{text "a\<^bsup>{0}\<^esup>"} and similar examples can match the empty string). We shall come back to
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   293
  this example later in the paper. 
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   294
  These problems can of course be solved in matching
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   295
  algorithms where automata go beyond the classic notion and for
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   296
  instance include explicit counters (e.g.~\cite{CountingSet2020}).
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   297
  The point here is that Brzozowski derivatives and the algorithms by
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   298
  Sulzmann and Lu can be straightforwardly extended to deal with
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   299
  bounded regular expressions and moreover the resulting code
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   300
  still consists of only simple recursive functions and inductive
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   301
  datatypes. Finally, bounded regular expressions 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   302
  do not destroy our finite boundedness property, which we shall
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   303
  prove later on.
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   304
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   305
  %, because during the lexing process counters will only be
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   306
  %decremented.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   307
  
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   308
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   309
  Central to Brzozowski's regular expression matcher are two functions
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   310
  called @{text nullable} and \emph{derivative}. The latter is written
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   311
  $r\backslash c$ for the derivative of the regular expression $r$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   312
  w.r.t.~the character $c$. Both functions are defined by recursion over
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   313
  regular expressions.  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   314
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   315
\begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   316
\begin{tabular}{c @ {\hspace{-9mm}}c}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   317
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   318
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   319
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   320
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   321
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   322
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   323
  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   324
  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   325
  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   326
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   327
  @{thm (lhs) der.simps(7)} & $\dn$ & @{thm (rhs) der.simps(7)}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   328
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   329
  &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   330
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   331
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   332
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   333
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   334
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   335
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   336
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   337
  @{thm (lhs) nullable.simps(7)} & $\dn$ & @{text "if"} @{text "n = 0"}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   338
  & & @{text "then"} @{const "True"}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   339
  & & @{text "else"} @{term "nullable r"}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   340
  \end{tabular}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   341
\end{tabular}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   342
\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   343
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   344
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   345
  We can extend this definition to give derivatives w.r.t.~strings,
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   346
  namely as @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   347
  and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   348
  Using @{text nullable} and the derivative operation, we can
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   349
define a simple regular expression matcher, namely
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   350
$@{text "match s r"} \dn @{term nullable}(r\backslash s)$.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   351
This is essentially Brzozowski's algorithm from 1964. Its
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   352
main virtue is that the algorithm can be easily implemented as a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   353
functional program (either in a functional programming language or in
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   354
a theorem prover). The correctness of @{text match} amounts to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   355
establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   356
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   357
\begin{proposition}\label{matchcorr} 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   358
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   359
\end{proposition}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   360
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   361
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   362
It is a fun exercise to formally prove this property in a theorem prover.
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   363
We are aware
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   364
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   365
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   366
of the work by Krauss and Nipkow~\cite{Krauss2011}.  And another one
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   367
in Coq is given by Coquand and Siles \cite{Coquand2012}.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   368
Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   369
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   370
The novel idea of Sulzmann and Lu is to extend this algorithm for 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   371
lexing, where it is important to find out which part of the string
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   372
is matched by which part of the regular expression.
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   373
For this Sulzmann and Lu presented two lexing algorithms in their
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   374
paper~\cite{Sulzmann2014}. The first algorithm consists of two phases: first a
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   375
  matching phase (which is Brzozowski's algorithm) and then a value
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   376
  construction phase. The values encode \emph{how} a regular expression
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   377
  matches a string. \emph{Values} are defined as the inductive datatype
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   378
  %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   379
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   380
  @{text "v ::="}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   381
  @{const "Void"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   382
  @{term "val.Char c"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   383
  @{term "Left v"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   384
  @{term "Right v"} $\mid$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   385
  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   386
  @{term "Stars vs"} 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   387
  \end{center}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   388
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   389
  \noindent where we use @{term vs} to stand for a list of values. The
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   390
  string underlying a value can be calculated by a @{const flat}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   391
  function, written @{term "flat DUMMY"}. It traverses a value and
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   392
  collects the characters contained in it (see \cite{AusafDyckhoffUrban2016}).
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   393
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   394
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   395
  Sulzmann and Lu also define inductively an
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   396
  inhabitation relation that associates values to regular expressions. Our
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   397
  version of this relation is defined by the following six rules:
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   398
  %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   399
  \begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   400
  \begin{tabular}{@ {}l@ {}}
644
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   401
  @{thm[mode=Axiom] Prf.intros(4)}\qquad
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   402
  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\qquad
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   403
  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\qquad
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   404
  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   405
  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\qquad
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   406
  @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\qquad
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   407
  $\mprset{flushleft}\inferrule{
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   408
  @{thm (prem 1) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}\\\\
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   409
  @{thm (prem 2) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}\quad
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   410
  @{thm (prem 3) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   411
  }
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   412
  {@{thm (concl) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   413
  $
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   414
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   415
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   416
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   417
  \noindent Note that no values are associated with the regular expression
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   418
  @{term ZERO}, since it cannot match any string. Interesting is our version
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   419
  of the rule for @{term "STAR r"} where we require that each value
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   420
  in  @{term vs} flattens to a non-empty string. This means if @{term "STAR r"} ``fires''
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   421
  one or more times, then each copy needs to match a non-empty string.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   422
  Similarly, in the rule
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   423
  for @{term "NTIMES r n"} we require that the length of the list
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   424
  @{term "vs\<^sub>1 @ vs\<^sub>2"} equals @{term n} (meaning the regular expression
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   425
  @{text r} matches @{text n}-times) and that the first segment of this list
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   426
  contains values that flatten to non-empty strings followed by a segment that
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   427
  only contains values that flatten to the empty string. 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   428
  It is routine to establish how values ``inhabiting'' a regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   429
  expression correspond to the language of a regular expression, namely
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   430
  @{thm L_flat_Prf}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   431
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   432
  In general there is more than one value inhabiting a regular
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   433
  expression (meaning regular expressions can typically match more
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   434
  than one string). But even when fixing a string from the language of the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   435
  regular expression, there are generally more than one way of how the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   436
  regular expression can match this string. POSIX lexing is about
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   437
  identifying the unique value for a given regular expression and a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   438
  string that satisfies the informal POSIX rules (see
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   439
  \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   440
  %\footnote{POSIX
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   441
  %	lexing acquired its name from the fact that the corresponding
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   442
  %	rules were described as part of the POSIX specification for
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   443
  %	Unix-like operating systems \cite{POSIX}.}
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   444
  Sometimes these
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   445
  informal rules are called \emph{maximal munch rule} and \emph{rule priority}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   446
  One contribution of our earlier paper is to give a convenient
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   447
 specification for what POSIX values are (the inductive rules are shown in
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   448
  Fig~\ref{POSIXrules}).
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   449
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   450
\begin{figure}[t]
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   451
  \begin{center}\small%
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   452
  \begin{tabular}{@ {\hspace{-2mm}}c@ {}}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   453
  \\[-8.5mm]
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   454
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   455
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   456
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   457
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   458
  $\mprset{flushleft}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   459
   \inferrule
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   460
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   461
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   462
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   463
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   464
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   465
  $\mprset{flushleft}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   466
   \inferrule
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   467
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   468
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   469
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   470
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   471
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\medskip\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   472
  \mprset{sep=4mm} 
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   473
  @{thm[mode=Rule] Posix.intros(9)}\<open>Pn[]\<close>\quad\; 
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   474
  $\mprset{flushleft}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   475
   \inferrule
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   476
   {@{thm (prem 1) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   477
    @{thm (prem 2) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   478
    @{thm (prem 3) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \\\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   479
    @{thm (prem 4) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   480
   {@{thm (concl) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}$\<open>Pn+\<close>
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   481
  \\[-4mm]   
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   482
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   483
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   484
  \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   485
  of given a string $s$ and a regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   486
  expression $r$ what is the unique value $v$ that satisfies the informal POSIX constraints for
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   487
  regular expression matching.\smallskip}\label{POSIXrules}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   488
  \end{figure}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   489
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   490
  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   491
  an injection function on values that mirrors (but inverts) the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   492
  construction of the derivative on regular expressions. Essentially it
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   493
  injects back a character into a value.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   494
  For this they define two functions called @{text mkeps} and @{text inj}:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   495
 %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   496
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   497
  \begin{tabular}{@ {}l@ {}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   498
  \begin{tabular}{@ {}lcl@ {}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   499
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   500
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   501
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   502
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   503
  @{thm (lhs) mkeps.simps(5)} & $\dn$ & @{thm (rhs) mkeps.simps(5)}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   504
  \end{tabular}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   505
  %\end{tabular}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   506
  %\end{center}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   507
  \smallskip\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   508
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   509
  %\begin{center}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   510
  %\begin{tabular}{@ {}cc@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   511
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   512
  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)[of "c"]}\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   513
  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   514
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   515
  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   516
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   517
  %!
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   518
  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   519
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   520
  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   521
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   522
  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   523
  @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   524
  %!
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   525
  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   526
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   527
  @{thm (lhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]} & $\dn$ 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   528
      & @{thm (rhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   529
  \end{tabular}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   530
  %!&
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   531
  %!\begin{tabular}{@ {\hspace{-3mm}}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   532
  
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   533
  %!\end{tabular}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   534
  %!\end{tabular}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   535
  \end{tabular}%!\smallskip
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   536
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   537
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   538
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   539
  The function @{text mkeps} is run when the last derivative is nullable, that is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   540
  the string to be matched is in the language of the regular expression. It generates
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   541
  a value for how the last derivative can match the empty string. In case
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   542
  of @{term "NTIMES r n"} we use the function @{term replicate} in order to generate
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   543
  a list of exactly @{term n} copies, which is the length of the list we expect in this
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   544
  case.  The injection function\footnote{While the character argument @{text c} is not
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   545
  strictly necessary in the @{text inj}-function for the fragment of regular expressions we
644
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
   546
  use in this paper, it is necessary for extended regular expressions. For example for the range regular expression of the form @{text "[a-z]"}.
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   547
  We therefore keep this argument from the original formulation of @{text inj} by Sulzmann and Lu.}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   548
  then calculates the corresponding value for each intermediate derivative until
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   549
  a value for the original regular expression is generated.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   550
  Graphically the algorithm by
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   551
  Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   552
  where the path from the left to the right involving @{term derivatives}/@{const
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   553
  nullable} is the first phase of the algorithm (calculating successive
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   554
  \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   555
  left, the second phase.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   556
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   557
\begin{center}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   558
\begin{tikzpicture}[scale=0.85,node distance=8mm,
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   559
                    every node/.style={minimum size=6mm}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   560
\node (r1)  {@{term "r\<^sub>1"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   561
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   562
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   563
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   564
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   565
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   566
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   567
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   568
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   569
\draw[->,line width=1mm](r4) -- (v4);
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   570
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   571
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   572
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   573
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   574
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   575
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   576
\draw (r4) node[anchor=north west] {\;\raisebox{-5mm}{@{term "mkeps"}}};
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   577
\end{tikzpicture}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   578
\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   579
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   580
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   581
  The picture shows the steps required when a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   582
  regular expression, say @{text "r\<^sub>1"}, matches the string @{term
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   583
  "[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:%\\[-8mm]
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   584
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   585
%  \begin{figure}[t]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   586
%\begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   587
%\begin{tikzpicture}[scale=1,node distance=1cm,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   588
%                    every node/.style={minimum size=6mm}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   589
%\node (r1)  {@{term "r\<^sub>1"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   590
%\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   591
%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   592
%\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   593
%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   594
%\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   595
%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   596
%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   597
%\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   598
%\draw[->,line width=1mm](r4) -- (v4);
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   599
%\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   600
%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   601
%\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   602
%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   603
%\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   604
%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   605
%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   606
%\end{tikzpicture}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   607
%\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   608
%\mbox{}\\[-13mm]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   609
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   610
%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   611
%matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   612
%left to right) is \Brz's matcher building successive derivatives. If the 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   613
%last regular expression is @{term nullable}, then the functions of the 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   614
%second phase are called (the top-down and right-to-left arrows): first 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   615
%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   616
%how the empty string has been recognised by @{term "r\<^sub>4"}. After
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   617
%that the function @{term inj} ``injects back'' the characters of the string into
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   618
%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   619
%the POSIX value for this string and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   620
%regular expression.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   621
%\label{Sulz}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   622
%\end{figure} 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   623
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   624
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   625
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   626
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   627
  \begin{tabular}{lcl}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   628
  @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   629
  @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   630
                     @{term "None"}  @{text "\<Rightarrow>"} @{term None}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   631
                     %%& & \hspace{27mm}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   632
		     $|\;$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   633
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   634
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   635
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   636
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   637
We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   638
this algorithm is correct, that is it generates POSIX values. The
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   639
central property we established relates the derivative operation to the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   640
injection function.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   641
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   642
  \begin{proposition}\label{Posix2}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   643
	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   644
\end{proposition}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   645
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   646
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   647
  With this in place we were able to prove:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   648
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   649
  \begin{proposition}\mbox{}\label{lexercorrect}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   650
  \textrm{(1)}\; @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}.\\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   651
  \mbox{\hspace{29.5mm}}\textrm{(2)}\; @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}.
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   652
  %
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   653
  % \smallskip\\
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   654
  %\begin{tabular}{ll}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   655
  %(1) & @{thm (lhs) lexer_correct_None} \;if and only if\; @{thm (rhs) lexer_correct_None}\\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   656
  %(2) & @{thm (lhs) lexer_correct_Some} \;if and only if\; @{thm (rhs) lexer_correct_Some}\\
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
   657
  %\end{tabular}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   658
  \end{proposition}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   659
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   660
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   661
  In fact we have shown that, in the success case, the generated POSIX value $v$ is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   662
  unique and in the failure case that there is no POSIX value $v$ that satisfies
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   663
  $(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   664
  slow in cases where the derivatives grow arbitrarily (recall the example from the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   665
  Introduction). However it can be used as a convenient reference point for the correctness
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   666
  proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   667
  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   668
*}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   669
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   670
section {* Bitcoded Regular Expressions and Derivatives *}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   671
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   672
text {*
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   673
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   674
  In the second part of their paper \cite{Sulzmann2014},
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   675
  Sulzmann and Lu describe another algorithm that also generates POSIX
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   676
  values but dispenses with the second phase where characters are
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   677
  injected ``back'' into values. For this they annotate bitcodes to
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   678
  regular expressions, which we define in Isabelle/HOL as the datatype\medskip
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   679
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   680
  \begin{center}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   681
  %\noindent
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   682
  %\begin{minipage}{0.9\textwidth}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   683
  \,@{term breg} $\,::=\,$ @{term "AZERO"}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   684
               $\,\mid$ @{term "AONE bs"}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   685
               $\,\mid$ @{term "ACHAR bs c"}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   686
               $\,\mid$ @{term "AALTs bs rs"}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   687
               $\,\mid$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   688
               $\,\mid$ @{term "ASTAR bs r"}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   689
	       $\,\mid$ \mbox{@{term "ANTIMES bs r n"}}%\hfill\mbox{}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   690
  %\end{minipage}\medskip	       
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   691
  \end{center}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   692
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   693
  \noindent where @{text bs} stands for bitsequences; @{text r},
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   694
  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   695
  expressions; and @{text rs} for lists of bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   696
  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   697
  is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}. 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   698
  For bitsequences we use lists made up of the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   699
  constants @{text Z} and @{text S}.  The idea with bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   700
  expressions is to incrementally generate the value information (for
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   701
  example @{text Left} and @{text Right}) as bitsequences. For this 
578
e71a6e2aca2d updated paper and literature
Christian Urban <christian.urban@kcl.ac.uk>
parents: 571
diff changeset
   702
  Sulzmann and Lu follow Nielsen and Henglein \cite{NielsenHenglein2011}
e71a6e2aca2d updated paper and literature
Christian Urban <christian.urban@kcl.ac.uk>
parents: 571
diff changeset
   703
  and define a coding
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   704
  function for how values can be coded into bitsequences.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   705
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   706
  \begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   707
  \begin{tabular}{c@ {\hspace{5mm}}c}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   708
  \begin{tabular}{lcl}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   709
  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   710
  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   711
  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   712
  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   713
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   714
  &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   715
  \begin{tabular}{lcl}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   716
  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   717
  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   718
  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   719
  \mbox{\phantom{XX}}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   720
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   721
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   722
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   723
   
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   724
  \noindent
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   725
  As can be seen, this coding is ``lossy'' in the sense that it does not
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   726
  record explicitly character values and also not sequence values (for
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   727
  them it just appends two bitsequences). However, the
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   728
  different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   729
  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   730
  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   731
  indicates the end of the list. The lossiness makes the process of
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   732
  decoding a bit more involved, but the point is that if we have a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   733
  regular expression \emph{and} a bitsequence of a corresponding value,
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   734
  then we can always decode the value accurately~(see Fig.~\ref{decode}).
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   735
  The function \textit{decode} checks whether all of the bitsequence is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   736
  consumed and returns the corresponding value as @{term "Some v"}; otherwise
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   737
  it fails with @{text "None"}. We can establish that for a value $v$
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   738
  inhabiting a regular expression $r$, the decoding of its
578
e71a6e2aca2d updated paper and literature
Christian Urban <christian.urban@kcl.ac.uk>
parents: 571
diff changeset
   739
  bitsequence never fails (see also \cite{NielsenHenglein2011}).
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   740
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   741
  %The decoding can be
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   742
  %defined by using two functions called $\textit{decode}'$ and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   743
  %\textit{decode}:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   744
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   745
  \begin{figure}[t]
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   746
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   747
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
599
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
   748
  $\textit{decode}'\,bs\,(\ONE)$ & $\;\dn\;$ & $(\Empty, bs)$\\
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
   749
  $\textit{decode}'\,bs\,(c)$ & $\;\dn\;$ & $(\Char\,c, bs)$\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   750
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   751
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   752
       (\Left\,v, bs_1)$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   753
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   754
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   755
       (\Right\,v, bs_1)$\\                           
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   756
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   757
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   758
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   759
        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   760
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   761
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   762
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   763
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   764
        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   765
  $\textit{decode}'\,bs\,(r^{\{n\}})$ & $\dn$ & $\textit{decode}'\,bs\,r^*$\smallskip\smallskip\\
599
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
   766
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
   767
  $\textit{decode}\,bs\,r$ & $\dn$ & 
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
   768
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   769
  & & $\;\;\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;\textit{else}\;\textit{None}$\\[-5mm]   
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   770
  \end{tabular}    
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   771
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   772
  \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   773
  \end{figure}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   774
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   775
  %\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   776
  %The function \textit{decode} checks whether all of the bitsequence is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   777
  %consumed and returns the corresponding value as @{term "Some v"}; otherwise
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   778
  %it fails with @{text "None"}. We can establish that for a value $v$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   779
  %inhabited by a regular expression $r$, the decoding of its
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   780
  %bitsequence never fails.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   781
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   782
\begin{lemma}\label{codedecode}\it
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   783
  If $\;\vdash v : r$ then
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   784
  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   785
\end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   786
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   787
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   788
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   789
  Sulzmann and Lu define the function \emph{internalise}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   790
  in order to transform (standard) regular expressions into annotated
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   791
  regular expressions. We write this operation as $r^\uparrow$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   792
  This internalisation uses the following
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   793
  \emph{fuse} function.\medskip
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   794
  
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   795
  %
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   796
  %!\begin{center}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   797
  \noindent\begin{minipage}{\textwidth}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   798
  \begin{tabular}{@ {}c@ {\hspace{3mm}}c@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   799
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   800
  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   801
  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   802
     $\textit{ONE}\,(bs\,@\,bs')$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   803
  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   804
     $\textit{CHAR}\,(bs\,@\,bs')\,c$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   805
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   806
  &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   807
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   808
  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   809
     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   810
  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   811
     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   812
  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   813
     $\textit{STAR}\,(bs\,@\,bs')\,r$\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   814
  $\textit{fuse}\,bs\,(\textit{NT}\,bs'\,r\,n)$ & $\dn$ &
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   815
     $\textit{NT}\,(bs\,@\,bs')\,r\,n$   
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   816
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   817
  \end{tabular}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   818
  \end{minipage}\medskip
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   819
  %!\end{center}    
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   820
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   821
  \noindent
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
   822
  This function ``fuses'' a bitsequence to the topmost constructor of an bitcoded regular expressions. 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   823
  A regular expression can then be \emph{internalised} into a bitcoded
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   824
  regular expression as follows:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   825
  %
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   826
  %!\begin{center}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   827
  %!\begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {\hspace{2mm}}c@ {}}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   828
  %!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   829
  %!$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   830
  %!$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   831
  %!\end{tabular}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   832
  %!&
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   833
  %!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   834
  %!$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   835
  %!$(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   836
  %!$(r^{\{n\}})^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   837
  %!      $\textit{NT}\;[]\,r^\uparrow\,n$
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   838
  %!\end{tabular}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   839
  %!&
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   840
  %!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   841
  %!$(r_1 + r_2)^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   842
  %!       $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   843
  %!                          (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   844
  %!$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   845
  %!       $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   846
  %!\end{tabular}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   847
  %!\end{tabular}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   848
  %!\end{center}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   849
  %!
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   850
  \begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   851
  \begin{tabular}{@ {}c@ {\hspace{7mm}}c@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   852
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   853
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   854
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   855
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   856
  $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   857
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   858
  &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   859
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   860
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   861
         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   862
                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   863
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   864
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   865
  $(r^{\{n\}})^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   866
        $\textit{NT}\;[]\,r^\uparrow\,n$	 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   867
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   868
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   869
  \end{center}    
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   870
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   871
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   872
  There is also an \emph{erase}-function, written $r^\downarrow$, which
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   873
  transforms a bitcoded regular expression into a (standard) regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   874
  expression by just erasing the annotated bitsequences. We omit the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   875
  straightforward definition. For defining the algorithm, we also need
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   876
  the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   877
  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   878
  bitcoded regular expressions.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   879
  %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   880
  \begin{center}
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
   881
  \begin{tabular}{@ {\hspace{-1mm}}c@ {\hspace{10mm}}c@ {}}
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   882
  \begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   883
  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   884
  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   885
  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   886
  $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   887
  \multicolumn{3}{l}{$\quad\exists\, r \in \rs. \,\textit{bnullable}\,r$}\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   888
  $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   889
  \multicolumn{3}{l}{$\quad\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$}\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   890
  $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   891
     $\textit{True}$\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   892
  $\textit{bnullable}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   893
  \multicolumn{3}{l}{$\quad\textit{if}\;n = 0\;\textit{then}\;\textit{True}\;
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   894
  \textit{else}\;\textit{bnullable}\,r$}\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   895
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   896
  &
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   897
  \begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   898
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   899
  $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   900
  $bs\,@\,\textit{bmkepss}\,\rs$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   901
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   902
  \multicolumn{3}{l}{$\quad{}bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   903
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   904
     $bs \,@\, [\S]$\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   905
   $\textit{bmkeps}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   906
   \multicolumn{3}{l@ {}}{$\quad\textit{if}\;n=0\;\textit{then}\;bs \,@\, [\S]$}\\
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   907
   \multicolumn{3}{l@ {}}{$\quad\textit{else}\,bs \,@\, [\Z] \,@\, \textit{bmkeps}\,r\,@\,
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   908
        \textit{bmkeps}\,(@{term "ANTIMES [] r (n - 1)"})$}\\   
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   909
  $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   910
  \multicolumn{3}{l}{$\quad\textit{if}\;\textit{bnullable}\,r\,\textit{then}\;\textit{bmkeps}\,r\;
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   911
  \textit{else}\;\textit{bmkepss}\,\rs$}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   912
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   913
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   914
  \end{center}    
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   915
 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   916
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   917
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   918
  The key function in the bitcoded algorithm is the derivative of a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   919
  bitcoded regular expression. This derivative function calculates the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   920
  derivative but at the same time also the incremental part of the bitsequences
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   921
  that contribute to constructing a POSIX value.	
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   922
  % 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   923
  \begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
   924
  \begin{tabular}{@ {}l@ {\hspace{-1mm}}cl@ {}}
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   925
  $(\textit{ZERO})\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\ 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   926
  $(\textit{ONE}\;bs)\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\  
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   927
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   928
        $\textit{if}\;c=d\; \;\textit{then}\;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   929
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   930
  $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   931
        $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   932
  $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   933
     $\textit{if}\;\textit{bnullable}\,r_1$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   934
  & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   935
  $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   936
  & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   937
  $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   938
      $\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   939
       (\textit{STAR}\,[]\,r)$\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   940
  $(\textit{NT}\,bs\,r\,n)\backslash c$ & $\dn$ &
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   941
      $\textit{if}\;n = 0\; \textit{then}\;\textit{ZERO}\;\textit{else}\;
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   942
      \textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   943
       (\textit{NT}\,[]\,r\,(n - 1))$     
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   944
  \end{tabular}    
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   945
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   946
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   947
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   948
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   949
  This function can also be extended to strings, written $r\backslash s$,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   950
  just like the standard derivative.  We omit the details. Finally we
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   951
  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   952
  %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   953
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   954
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   955
  $\textit{blexer}\;r\,s$ & $\dn$ &
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   956
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\               
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   957
  & & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   958
      $\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
   959
      $\;\textit{else}\;\textit{None}$
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   960
\end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   961
\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   962
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   963
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   964
This bitcoded lexer first internalises the regular expression $r$ and then
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   965
builds the bitcoded derivative according to $s$. If the derivative is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   966
(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   967
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value.  If
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   968
the derivative is \emph{not} nullable, then $\textit{None}$ is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   969
returned. We can show that this way of calculating a value
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   970
generates the same result as \textit{lexer}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   971
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   972
Before we can proceed we need to define a helper function, called
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   973
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   974
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   975
\begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   976
  \begin{tabular}{lcl}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   977
  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   978
  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   979
  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   980
  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   981
  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   982
  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   983
      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   984
  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   985
  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   986
  @{thm (lhs) retrieve.simps(9)} & $\dn$ & @{thm (rhs) retrieve.simps(9)}\\
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
   987
  @{thm (lhs) better_retrieve2} & $\dn$ & @{thm (rhs) better_retrieve2}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   988
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   989
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   990
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   991
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   992
The idea behind this function is to retrieve a possibly partial
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   993
bitsequence from a bitcoded regular expression, where the retrieval is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   994
guided by a value.  For example if the value is $\Left$ then we
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   995
descend into the left-hand side of an alternative in order to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   996
assemble the bitcode. Similarly for
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   997
$\Right$. The property we can show is that for a given $v$ and $r$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   998
with $\vdash v : r$, the retrieved bitsequence from the internalised
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   999
regular expression is equal to the bitcoded version of $v$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1000
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1001
\begin{lemma}\label{retrievecode}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1002
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1003
\end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1004
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1005
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1006
We also need some auxiliary facts about how the bitcoded operations
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1007
relate to the ``standard'' operations on regular expressions. For
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1008
example if we build a bitcoded derivative and erase the result, this
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1009
is the same as if we first erase the bitcoded regular expression and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1010
then perform the ``standard'' derivative operation.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1011
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1012
\begin{lemma}\label{bnullable}%\mbox{}\smallskip\\
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1013
\textit{(1)} $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\    
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1014
\mbox{\hspace{21.5mm}}\textit{(2)} $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1015
\mbox{\hspace{21.5mm}}\textit{(3)} $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1016
%\begin{tabular}{ll}
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1017
%\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\    
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1018
%\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1019
%\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ \;provided\; $\textit{nullable}(r^\downarrow)$.
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1020
%\end{tabular}  
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1021
\end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1022
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1023
%\begin{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1024
%  All properties are by induction on annotated regular expressions.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1025
%  %There are no interesting cases.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1026
%\end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1027
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1028
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1029
The only difficulty left for the correctness proof is that the bitcoded algorithm
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1030
has only a ``forward phase'' where POSIX values are generated incrementally.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1031
We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1032
functions during the forward phase. An auxiliary function, called $\textit{flex}$,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1033
allows us to recast the rules of $\lexer$ in terms of a single
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1034
phase and stacked up injection functions.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1035
%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1036
\begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1037
\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1038
  $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ & 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1039
  $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1040
  $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1041
\end{tabular}    
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1042
\end{center}    
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1043
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1044
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1045
The point of this function is that when
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1046
reaching the end of the string, we just need to apply the stacked up
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1047
injection functions to the value generated by @{text mkeps}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1048
Using this function we can recast the success case in @{text lexer} 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1049
as follows:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1050
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1051
\begin{lemma}\label{flex}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1052
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1053
      (\textit{mkeps}\,(r\backslash s))$.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1054
\end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1055
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1056
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1057
Note we did not redefine \textit{lexer}, we just established that the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1058
value generated by \textit{lexer} can also be obtained by a different
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1059
method. While this different method is not efficient (we essentially
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1060
need to traverse the string $s$ twice, once for building the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1061
derivative $r\backslash s$ and another time for stacking up injection
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1062
functions), it helps us with proving
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1063
that incrementally building up values in @{text blexer} generates the same result.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1064
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1065
This brings us to our main lemma in this section: if we calculate a
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1066
derivative, say $r\backslash s$, and have a value, say $v$, inhabiting
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1067
this derivative, then we can produce the result @{text lexer} generates
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1068
by applying this value to the stacked-up injection functions
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1069
that $\textit{flex}$ assembles. The lemma establishes that this is the same
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1070
value as if we build the annotated derivative $r^\uparrow\backslash s$
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1071
and then retrieve the bitcoded version of @{text v}, followed by a
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1072
decoding step.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1073
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1074
\begin{lemma}[Main Lemma]\label{mainlemma}\mbox{}\\\it
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1075
If $\vdash v : r\backslash s$ then 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1076
$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1077
  \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1078
\end{lemma}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1079
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1080
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1081
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1082
\noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1083
%With this lemma in place,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1084
We can then prove the correctness of \textit{blexer}---it indeed
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1085
produces the same result as \textit{lexer}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1086
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1087
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1088
\begin{theorem}\label{thmone}
599
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
  1089
$\textit{blexer}\,r\,s = \textit{lexer}\,r\,s$
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1090
\end{theorem}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1091
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1092
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1093
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1094
\noindent This establishes that the bitcoded algorithm \emph{without}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1095
simplification produces correct results. This was
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1096
only conjectured by Sulzmann and Lu in their paper
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1097
\cite{Sulzmann2014}. The next step is to add simplifications.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1098
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1099
*}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1100
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1101
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1102
section {* Simplification *}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1103
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1104
text {*
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1105
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1106
     Derivatives as calculated by Brzozowski's method are usually more
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1107
     complex regular expressions than the initial one; the result is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1108
     that derivative-based matching and lexing algorithms are
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1109
     often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1110
     optimisations are possible, such as the simplifications
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1111
     $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1112
     $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1113
     simplifications can considerably speed up the two algorithms  in many
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1114
     cases, they do not solve fundamentally the growth problem with
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1115
     derivatives. To see this let us return to the example from the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1116
     Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1117
     If we delete in the 3rd step all $\ZERO{}s$ and $\ONE$s according to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1118
     the simplification rules shown above we obtain
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1119
     %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1120
     %%
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1121
     %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1122
     \begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1123
     (a + aa)^* \quad\xll\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1124
      \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1125
     ((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1126
     \end{equation}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1127
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1128
     \noindent This is a simpler derivative, but unfortunately we
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1129
     cannot make any further simplifications. This is a problem because
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1130
     the outermost alternatives contains two copies of the same
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1131
     regular expression (underlined with $r$). These copies will
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1132
     spawn new copies in later derivative steps and they in turn even more copies. This
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1133
     destroys any hope of taming the size of the derivatives.  But the
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1134
     second copy of $r$ in~\eqref{derivex} will never contribute to a
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1135
     value, because POSIX lexing will always prefer matching a string
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1136
     with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1137
     The issue with the simple-minded
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1138
     simplification rules above is that the rule $r + r \Rightarrow r$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1139
     will never be applicable because as can be seen in this example the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1140
     regular expressions are not next to each other but separated by another regular expression.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1141
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1142
     But here is where Sulzmann and Lu's representation of generalised
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1143
     alternatives in the bitcoded algorithm shines: in @{term
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1144
     "ALTs bs rs"} we can define a more aggressive simplification by
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1145
     recursively simplifying all regular expressions in @{text rs} and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1146
     then analyse the resulting list and remove any duplicates.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1147
     Another advantage with the bitsequences in  bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1148
     expressions is that they can be easily modified such that simplification does not
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1149
     interfere with the value constructions. For example we can ``flatten'', or
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1150
     de-nest, or spill out, @{text ALTs} as follows
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1151
     %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1152
     \[
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1153
     @{text "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) :: rs\<^sub>1)"}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1154
     \quad\xrightarrow{bsimp}\quad
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1155
     @{text "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1156
     \]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1157
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1158
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1159
     where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1160
     to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1161
     ensure that the correct value corresponding to the original (unsimplified)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1162
     regular expression can still be extracted. %In this way the value construction
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1163
     %is not affected by simplification. 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1164
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1165
     However there is one problem with the definition for the more
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1166
     aggressive simplification rules described by Sulzmann and Lu. Recasting
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1167
     their definition with our syntax they define the step of removing
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1168
     duplicates as
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1169
     %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1170
     \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1171
     bs (nub (map bsimp rs))"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1172
     \]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1173
   
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1174
     \noindent where they first recursively simplify the regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1175
     expressions in @{text rs} (using @{text map}) and then use
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1176
     Haskell's @{text nub}-function to remove potential
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1177
     duplicates. While this makes sense when considering the example
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1178
     shown in \eqref{derivex}, @{text nub} is the inappropriate
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1179
     function in the case of bitcoded regular expressions. The reason
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1180
     is that in general the elements in @{text rs} will have a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1181
     different annotated bitsequence and in this way @{text nub}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1182
     will never find a duplicate to be removed. One correct way to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1183
     handle this situation is to first \emph{erase} the regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1184
     expressions when comparing potential duplicates. This is inspired
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1185
     by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1186
     acc"}} where @{text eq} is an user-defined equivalence relation that
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1187
     compares two elements in @{text rs}. We define this function in
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1188
     Isabelle/HOL as
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1189
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1190
     \begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1191
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1192
     @{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1193
     @{thm (lhs) distinctWith.simps(2)} & $\dn$ &
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1194
     @{text "if (\<exists> y \<in> acc. eq x y)"}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1195
     @{text "then distinctWith xs eq acc"}\\
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1196
     & & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1197
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1198
     \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1199
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1200
     \noindent where we scan the list from left to right (because we
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1201
     have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1202
     equivalence relation for bitcoded regular expressions and @{text acc} is an accumulator for bitcoded regular
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1203
     expressions---essentially a set of regular expressions that we have already seen
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1204
     while scanning the list. Therefore we delete an element, say @{text x},
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1205
     from the list provided a @{text "y"} with @{text "y"} being equivalent to @{text x} is already in the accumulator;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1206
     otherwise we keep @{text x} and scan the rest of the list but 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1207
     add @{text "x"} as another ``seen'' element to @{text acc}. We will use
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1208
     @{term distinctWith} where @{text eq} is an equivalence that deletes bitsequences from bitcoded regular expressions
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1209
     before comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1210
     bitcoded regular expressions to @{text bool}:
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1211
     %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1212
     \begin{center}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1213
     \begin{tabular}{@ {}cc@ {}}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1214
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {\hspace{1mm}}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1215
     @{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1216
     @{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1217
     @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1218
      @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1219
     \mbox{}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1220
     \end{tabular}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1221
     &
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1222
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1223
     @{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1224
     @{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1225
     @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1226
     @{thm (lhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]}\\
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1227
     @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ & \\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1228
     \multicolumn{3}{r}{@{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1229
     \end{tabular}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1230
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1231
     \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1232
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1233
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1234
     where all other cases are set to @{text False}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1235
     This equivalence is clearly a computationally more expensive operation than @{text nub},
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1236
     but is needed in order to make the removal of unnecessary copies
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1237
     to work properly.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1238
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1239
     Our simplification function depends on three more helper functions, one is called
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1240
     @{text flts} and analyses lists of regular expressions coming from alternatives.
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1241
     It is defined by four clauses as follows:
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1242
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1243
     \begin{center}
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1244
     \begin{tabular}{c}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1245
     @{thm (lhs) flts.simps(1)} $\dn$ @{thm (rhs) flts.simps(1)} \qquad
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1246
     @{thm (lhs) flts.simps(2)} $\dn$ @{thm (rhs) flts.simps(2)} \qquad
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1247
     @{text "flts ((ALTs bs' rs') :: rs"}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1248
     %@{ thm (lhs) flts.simps(3)[of "bs'" "rs'"]}
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1249
     $\dn$ @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\smallskip\\
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1250
     @{text "flts (r :: rs)"} $\dn$ @{text "r :: flts rs"}\hspace{5mm}(otherwise)
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1251
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1252
     \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1253
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1254
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1255
     The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1256
     the third ``de-nests'' alternatives (but retains the
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1257
     bitsequence @{text "bs'"} accumulated in the inner alternative). There are
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1258
     some corner cases to be considered when the resulting list inside an alternative is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1259
     empty or a singleton list. We take care of those cases in the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1260
     @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1261
     sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1262
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1263
     \begin{center}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1264
     \begin{tabular}{@ {}c@ {\hspace{4mm}}c@ {}}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1265
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1266
     @{text "bsimpALTs bs []"}  & $\dn$ & @{text "ZERO"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1267
     @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1268
     @{text "bsimpALTs bs rs"}  & $\dn$ & @{text "ALTs bs rs"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1269
     \mbox{}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1270
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1271
     &
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1272
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1273
     @{text "bsimpSEQ bs _ ZERO"}  & $\dn$ & @{text "ZERO"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1274
     @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1275
     @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1276
         & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1277
     @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ &  @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1278
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1279
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1280
     \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1281
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1282
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1283
     With this in place we can define our simplification function as
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1284
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1285
     \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1286
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1287
     @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1288
         @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1289
     @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1290
     @{text "bsimp r"} & $\dn$ & @{text r}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1291
     \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1292
     \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1293
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1294
     \noindent
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1295
     We believe our recursive function @{term bsimp} simplifies bitcoded regular
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1296
     expressions as intended by Sulzmann and Lu with the small addition of removing ``useless'' @{text ONE}s
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1297
     in sequence regular
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1298
     expressions. There is no point in applying the
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1299
     @{text bsimp} function repeatedly (like the simplification in their paper which needs to be
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1300
     applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1301
     that is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1302
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1303
     \begin{proposition}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1304
     @{term "bsimp (bsimp r) = bsimp r"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1305
     \end{proposition}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1306
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1307
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1308
     This can be proved by induction on @{text r} but requires a detailed analysis
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1309
     that the de-nesting of alternatives always results in a flat list of regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1310
     expressions. We omit the details since it does not concern the correctness proof.
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1311
     %It might be interesting to not that we do not simplify inside @{term STAR} and
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1312
     %@{text NT}: the reason is that we want to keep the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1313
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1314
     Next we can include simplification after each derivative step leading to the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1315
     following notion of bitcoded derivatives:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1316
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1317
     \begin{center}
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1318
      \begin{tabular}{c@ {\hspace{10mm}}c}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1319
      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1320
      @{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1321
      \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1322
      &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1323
      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1324
      @{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1325
      \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1326
      \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1327
      \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1328
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1329
      \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1330
      and use it in the improved lexing algorithm defined as
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1331
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1332
     \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1333
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1334
  $\textit{blexer}^+\;r\,s$ & $\dn$ &
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1335
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\                
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1336
      & & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1337
      $\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$ $\;\textit{else}\;\textit{None}$
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1338
\end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1339
\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1340
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1341
       \noindent
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1342
       Note that in $\textit{blexer}^+$ the derivative $r_{der}$ is calculated
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1343
       using the simplifying derivative $\_\,\backslash_{bsimp}\,\_$.
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1344
       The remaining task is to show that @{term blexer} and
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1345
       @{term "blexer_simp"} generate the same answers.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1346
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1347
       When we first
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1348
       attempted this proof we encountered a problem with the idea
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1349
       in Sulzmann and Lu's paper where the argument seems to be to appeal
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1350
       again to the @{text retrieve}-function defined for the unsimplified version
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1351
       of the algorithm. But
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1352
       this does not work, because desirable properties such as
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1353
     %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1354
     \[
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1355
     @{text "retrieve r v = retrieve (bsimp r) v"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1356
     \]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1357
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1358
     \noindent do not hold under simplification---this property
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1359
     essentially purports that we can retrieve the same value from a
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1360
     simplified version of the regular expression. To start with @{text retrieve}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1361
     depends on the fact that the value @{text v} corresponds to the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1362
     structure of the regular expression @{text r}---but the whole point of simplification
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1363
     is to ``destroy'' this structure by making the regular expression simpler.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1364
     To see this consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1365
     value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1366
     we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1367
     bitsequence. The reason that this works is that @{text r} is an alternative
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1368
     regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1369
     @{text r}, then @{text v} does not correspond to the shape of the regular 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1370
     expression anymore. So unless one can somehow
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1371
     synchronise the change in the simplified regular expressions with
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1372
     the original POSIX value, there is no hope of appealing to @{text retrieve} in the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1373
     correctness argument for @{term blexer_simp}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1374
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1375
     For our proof we found it more helpful to introduce the rewriting systems shown in
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1376
     Fig~\ref{SimpRewrites}. The idea is to generate 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1377
     simplified regular expressions in small steps (unlike the @{text bsimp}-function which
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1378
     does the same in a big step), and show that each of
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1379
     the small steps preserves the bitcodes that lead to the POSIX value.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1380
     The rewrite system is organised such that $\leadsto$ is for bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1381
     expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1382
     expressions. The former essentially implements the simplifications of
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1383
     @{text "bsimpSEQ"} and @{text flts}; while the latter implements the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1384
     simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1385
     regular expression reduces in zero or more steps to the simplified
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1386
     regular expression generated by @{text bsimp}:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1387
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1388
     \begin{lemma}\label{lemone}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1389
     @{thm[mode=IfThen] rewrites_to_bsimp}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1390
     \end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1391
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1392
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1393
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1394
     \noindent
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1395
     We can also show that this rewrite system preserves @{term bnullable}, that 
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1396
     is simplification does not affect nullability:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1397
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1398
     \begin{lemma}\label{lembnull}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1399
     @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1400
     \end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1401
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1402
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1403
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1404
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1405
     From this, we can show that @{text bmkeps} will produce the same bitsequence
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1406
     as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1407
     establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1408
     in the paper by Sulzmannn and Lu). 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1409
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1410
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1411
     \begin{lemma}\label{lemthree}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1412
     @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1413
     \end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1414
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1415
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1416
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1417
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1418
     Crucial is also the fact that derivative steps and simplification steps can be interleaved,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1419
     which is shown by the fact that $\leadsto$ is preserved under derivatives.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1420
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1421
     \begin{lemma}\label{bderlem}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1422
     @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1423
     \end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1424
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1425
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1426
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1427
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1428
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1429
     Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1430
     that the unsimplified
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1431
     derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1432
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1433
     \begin{lemma}\label{lemtwo}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1434
     @{thm[mode=IfThen] central}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1435
     \end{lemma}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1436
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1437
     %\begin{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1438
     %By reverse induction on @{term s} generalising over @{text r}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1439
     %\end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1440
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1441
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1442
     With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1443
     generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
599
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
  1444
     is indeed the POSIX value as generated by \textit{lexer}.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1445
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1446
     \begin{theorem}
599
a5f666410101 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 578
diff changeset
  1447
     @{thm[mode=IfThen] main_blexer_simp[symmetric]} \; (@{text "= lexer r s"}\; by Thm.~\ref{thmone})
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1448
     \end{theorem}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1449
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1450
     %\begin{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1451
     %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1452
     %\end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1453
     
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1454
     \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1455
     This means that if the algorithm is called with a regular expression @{term r} and a string
498
ab626b60ee64 fixed tiny typo in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 496
diff changeset
  1456
     @{term s} with $@{term s} \in L(@{term r})$, it will return @{term "Some v"} for the unique
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1457
     @{term v} we defined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$; otherwise
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1458
     the algorithm returns @{term "None"} when $s \not\in L(r)$ and no such @{text v} exists.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1459
     This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1460
     The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily big but
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1461
     can be finitely bounded, which
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1462
     we shall show next.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1463
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1464
   \begin{figure}[t]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1465
  \begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1466
  \begin{tabular}{@ {\hspace{-8mm}}c@ {}}
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1467
  \\[-8mm]
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1468
  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1469
  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1470
  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1471
  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1472
  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1473
  @{thm[mode=Axiom] bs6}$A0$\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1474
  @{thm[mode=Axiom] bs7}$A1$\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1475
  @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1476
  @{thm[mode=Rule] rrewrite_srewrite.ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1477
  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1478
  @{thm[mode=Axiom] ss4}$L\ZERO$\quad
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1479
  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1480
  @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1481
  \end{tabular}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1482
  \end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1483
  \caption{The rewrite rules that generate simplified regular expressions
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1484
  in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1485
  expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1486
  expressions. Interesting is the $LD$ rule that allows copies of regular
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1487
  expressions to be removed provided a regular expression earlier in the list can
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1488
  match the same strings.}\label{SimpRewrites}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1489
  \end{figure}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1490
*}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1491
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1492
section {* Finite Bound for the Size of Derivatives *}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1493
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1494
text {*
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1495
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1496
In this section let us sketch our argument for why the size of the simplified
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1497
derivatives with the aggressive simplification function can be finitely bounded. Suppose
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1498
we have a size function for bitcoded regular expressions, written
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1499
$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1500
(we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). For this we show that for every $r$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1501
there exists a bound $N$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1502
such that 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1503
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1504
\begin{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1505
$\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1506
\end{center}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1507
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1508
\noindent
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1509
Note that the bound $N$ is a bound for \emph{all} strings, no matter how long they are.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1510
We establish this bound by induction on $r$. The base cases for @{term AZERO},
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1511
@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1512
for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1513
hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1514
$\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows\medskip
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1515
%
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1516
%!\begin{center}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1517
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1518
\noindent\begin{minipage}{\textwidth}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1519
\begin{tabular}{lcll}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1520
& & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1521
& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1522
    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]))\rrbracket $ & (1) \\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1523
& $\leq$ &
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1524
    $\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1525
    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1526
& $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1527
             \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1528
& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1529
      \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1530
& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1531
\end{tabular}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1532
\end{minipage}\medskip
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1533
%!\end{center}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1534
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1535
% tell Chengsong about Indian paper of closed forms of derivatives
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1536
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1537
\noindent
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1538
where in (1) the set $\textit{Suf}(@{text "r"}_1, s)$ are all the suffixes of $s$ where @{term "bders_simp r\<^sub>1 s'"} is nullable ($s'$ being a suffix of $s$).
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1539
In (3) we know that  $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1540
bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1541
than $N_2$. The list length after @{text distinctWith} is bounded by a number, which we call $l_{N_2}$. It stands
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1542
for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them).
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1543
We reason similarly for @{text STAR} and @{text NT}.\smallskip
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1544
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1545
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1546
Clearly we give in this finiteness argument (Step (5)) a very loose bound that is
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1547
far from the actual bound we can expect. We can do better than this, but this does not improve
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1548
the finiteness property we are proving. If we are interested in a polynomial bound,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1549
one would hope to obtain a similar tight bound as for partial
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1550
derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1551
@{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1552
partial derivatives). Unfortunately to obtain the exact same bound would mean
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1553
we need to introduce simplifications, such as
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1554
 $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1555
which exist for partial derivatives. However, if we introduce them in our
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1556
setting we would lose the POSIX property of our calculated values. For example
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1557
given the regular expressions @{term "SEQ (ALT a ab) (ALT b ONE)"} and the string $[a, b]$, then our
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1558
algorithm generates the following correct POSIX value
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1559
%
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1560
\[
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1561
@{term "Seq (Right (Seq (Char a) (Char b))) (Right Empty)"}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1562
\]
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1563
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1564
\noindent
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1565
Essentially it matches the string with the longer @{text "Right"}-alternative in the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1566
first sequence (and then the `rest' with the empty regular expression @{const ONE} from the second sequence). 
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1567
If we add the simplification above, then we obtain the following value
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1568
@{term "Seq (Left (Char a)) (Left (Char b))"}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1569
where the @{text "Left"}-alternatives get priority. However, this violates
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1570
the POSIX rules and we have not been able to
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1571
reconcile this problem. Therefore we leave better bounds for future work.%!\\[-6.5mm]
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1572
644
9f984ff20020 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 643
diff changeset
  1573
Note also that while Antimirov was able to give a bound on the \emph{size}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1574
of his partial derivatives~\cite{Antimirov95}, Brzozowski gave a bound
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1575
on the \emph{number} of derivatives, provided they are quotient via
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1576
ACI rules \cite{Brzozowski1964}. Brzozowski's result is crucial when one
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1577
uses his derivatives for obtaining a DFA (it essentially bounds
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1578
the number of states). However, this result does \emph{not}
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1579
transfer to our setting where we are interested in the \emph{size} of the
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1580
derivatives. For example it is \emph{not} true for our derivatives that the
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1581
set of derivatives $r \backslash s$ for a given $r$ and all strings
617
3ea6a38c33b5 small polish
Christian Urban <christian.urban@kcl.ac.uk>
parents: 616
diff changeset
  1582
$s$ is finite (even when simplified). This is because for our annotated regular expressions
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1583
the bitcode annotation is dependent on the number of iterations that
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1584
are needed for @{text STAR}-regular expressions. This is not a problem for us: Since we intend to do lexing
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1585
by calculating (as fast as possible) derivatives, the bound on the size
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1586
of the derivatives is important, not their number. % of derivatives.
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1587
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1588
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1589
*}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1590
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1591
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1592
section {* Conclusion *}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1593
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1594
text {*
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1595
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1596
   We set out in this work to prove in Isabelle/HOL the correctness of
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1597
   the second POSIX lexing algorithm by Sulzmann and Lu
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1598
   \cite{Sulzmann2014}. This follows earlier work where we established
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1599
   the correctness of the first algorithm
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1600
   \cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1601
   introduce our own specification for POSIX values,
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1602
   because the informal definition given by Sulzmann and Lu did not
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1603
   stand up to formal proof. Also for the second algorithm we needed
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1604
   to introduce our own definitions and proof ideas in order to
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1605
   establish the correctness.  Our interest in the second algorithm
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1606
   lies in the fact that by using bitcoded regular expressions and an
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1607
   aggressive simplification method there is a chance that the
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1608
   derivatives can be kept universally small (we established in this
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1609
   paper that for a given $r$ they can be kept finitely bounded for
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1610
   \emph{all} strings).  Our formalisation is approximately 7500 lines of
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1611
   Isabelle code. A little more than half of this code concerns the finiteness bound
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1612
   obtained in Section 5. This slight ``bloat'' in the latter part is because
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1613
   we had to introduce an intermediate datatype for annotated regular expressions and repeat many
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1614
   definitions for this intermediate datatype. But overall we think this made our
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1615
   formalisation work smoother. The code of our formalisation can be found at
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1616
   \textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1617
   %This is important if one is after
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1618
   %an efficient POSIX lexing algorithm based on derivatives.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1619
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1620
   Having proved the correctness of the POSIX lexing algorithm, which
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1621
   lessons have we learned? Well, we feel this is a very good example
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1622
   where formal proofs give further insight into the matter at
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1623
   hand. For example it is very hard to see a problem with @{text nub}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1624
   vs @{text distinctWith} with only experimental data---one would still
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1625
   see the correct result but find that simplification does not simplify in well-chosen, but not
499
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1626
   obscure, examples.
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1627
   %We found that from an implementation
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1628
   %point-of-view it is really important to have the formal proofs of
6a100d32314c updated the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 498
diff changeset
  1629
   %the corresponding properties at hand.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1630
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1631
   With the results reported here, we can of course only make a claim
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1632
   about the correctness of the algorithm and the sizes of the
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1633
   derivatives, not about the efficiency or runtime of our version of
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1634
   Sulzmann and Lu's algorithm. But we found the size is an important
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1635
   first indicator about efficiency: clearly if the derivatives can
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1636
   grow to arbitrarily big sizes and the algorithm needs to traverse
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1637
   the derivatives possibly several times, then the algorithm will be
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1638
   slow---excruciatingly slow that is. Other works seem to make
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1639
   stronger claims, but during our formalisation work we have
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1640
   developed a healthy suspicion when for example experimental data is
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1641
   used to back up efficiency claims. For instance Sulzmann and Lu
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1642
   write about their equivalent of @{term blexer_simp} \textit{``...we
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1643
   can incrementally compute bitcoded parse trees in linear time in
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1644
   the size of the input''} \cite[Page 14]{Sulzmann2014}.  Given the
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1645
   growth of the derivatives in some cases even after aggressive
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1646
   simplification, this is a hard to believe claim. A similar claim
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1647
   about a theoretical runtime of @{text "O(n\<^sup>2)"} for one
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1648
   specific list of regular expressions is made for the Verbatim
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1649
   lexer, which calculates tokens according to POSIX
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1650
   rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1651
   derivatives like in our work.  About their empirical data, the authors write:
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1652
   \textit{``The results of our empirical tests [..] confirm that
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1653
   Verbatim has @{text "O(n\<^sup>2)"} time complexity.''}
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1654
   \cite[Section~VII]{verbatim}.  While their correctness proof for
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1655
   Verbatim is formalised in Coq, the claim about the runtime
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1656
   complexity is only supported by some empirical evidence obtained by
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1657
   using the code extraction facilities of Coq.  Given our observation
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1658
   with the ``growth problem'' of derivatives, this runtime bound
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1659
   is unlikely to hold universally: indeed we tried out their extracted OCaml
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1660
   code with the example \mbox{@{text "(a + aa)\<^sup>*"}} as a single
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1661
   lexing rule, and it took for us around 5 minutes to tokenise a
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1662
   string of 40 $a$'s and that increased to approximately 19 minutes
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1663
   when the string is 50 $a$'s long. Taking into account that
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1664
   derivatives are not simplified in the Verbatim lexer, such numbers
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1665
   are not surprising.  Clearly our result of having finite
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1666
   derivatives might sound rather weak in this context but we think
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1667
   such efficiency claims really require further scrutiny.  The
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1668
   contribution of this paper is to make sure derivatives do not grow
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1669
   arbitrarily big (universally). In the example \mbox{@{text "(a +
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1670
   aa)\<^sup>*"}}, \emph{all} derivatives have a size of 17 or
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1671
   less. The result is that lexing a string of, say, 50\,000 a's with
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1672
   the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1673
   approximately 10 seconds with our Scala implementation of the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1674
   presented algorithm.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1675
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1676
   Finally, let us come back to the point about bounded regular
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1677
   expressions. We have in this paper only shown that @{term "NTIMES r
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1678
   n"} can be included, but all our results extend straightforwardly
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1679
   also to the other bounded regular expressions. We find bounded
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1680
   regular expressions fit naturally into the setting of Brzozowski
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1681
   derivatives and the bitcoded regular expressions by Sulzmann and Lu.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1682
   In contrast bounded regular expressions are often the Achilles'
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1683
   heel in regular expression matchers that use the traditional
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1684
   automata-based approach to lexing, primarily because they need to expand the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1685
   counters of bounded regular expressions into $n$-connected copies
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1686
   of an automaton. This is not needed in Sulzmann and Lu's algorithm.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1687
   To see the difference consider for example the regular expression @{term "SEQ (NTIMES a
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1688
   1001) (STAR a)"}, which is not permitted in the Go language because
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1689
   the counter is too big. In contrast we have no problem with
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1690
   matching this regular expression with, say 50\,000 a's, because the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1691
   counters can be kept compact. In fact, the overall size of the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1692
   derivatives is never greater than 5 in this example. Even in the
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1693
   example from Section 2, where Rust raises an error message, namely
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1694
   \mbox{@{text "a\<^bsup>{1000}{100}{5}\<^esup>"}}, the maximum size for
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1695
   our derivatives is a moderate 14.
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1696
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1697
   Let us also return to the example @{text
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1698
   "a\<^bsup>{0}{4294967295}\<^esup>"} which until recently Rust
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1699
   deemed acceptable. But this was due to a bug. It turns out that it took Rust
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1700
   more than 11 minutes to generate an automaton for this regular
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1701
   expression and then to determine that a string of just one(!)
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1702
   @{text a} does \emph{not} match this regular expression.  Therefore
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1703
   it is probably a wise choice that in newer versions of Rust's
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1704
   regular expression library such regular expressions are declared as
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1705
   ``too big'' and raise an error message. While this is clearly
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1706
   a contrived example, the safety guaranties Rust wants to provide necessitate
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1707
   this conservative approach.
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1708
   However, with the derivatives and simplifications we have shown
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1709
   in this paper, the example can be solved with ease:
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1710
   it essentially only involves operations on
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1711
   integers and our Scala implementation takes only a few seconds to
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1712
   find out that this string, or even much larger strings, do not match.
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1713
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1714
   Let us also compare our work to the verified Verbatim++ lexer where
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1715
   the authors of the Verbatim lexer introduced a number of
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1716
   improvements and optimisations, for example memoisation
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1717
   \cite{verbatimpp}. However, unlike Verbatim, which works with
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1718
   derivatives like in our work, Verbatim++ compiles first a regular
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1719
   expression into a DFA. While this makes lexing fast in many cases,
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1720
   with examples of bounded regular expressions like
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1721
   \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}}
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1722
   one needs to represent them as
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1723
   sequences of $a \cdot a \cdot \ldots \cdot a$ (500 a's in sequence). We have run
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1724
   their extracted code with such a regular expression as a
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1725
   single lexing rule and a string of 50\,000 a's---lexing in this case
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1726
   takes approximately 5~minutes. We are not aware of any better
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1727
   translation using the traditional notion of DFAs so that we can improve on this. Therefore we
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1728
   prefer to stick with calculating derivatives, but attempt to make
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1729
   this calculation (in the future) as fast as possible. What we can guaranty
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1730
   with the presented work is that the maximum size of the derivatives
616
8907d4b6316d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 615
diff changeset
  1731
   for \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}$\,\cdot\,$@{term "STAR a"}} is never bigger than 9. This means our Scala
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1732
   implementation again only needs a few seconds for this example and matching 50\,000 a's, say.
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1733
   %
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1734
   %
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1735
   %Possible ideas are
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1736
   %zippers which have been used in the context of parsing of
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1737
   %context-free grammars \cite{zipperparser}.
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1738
   %\\[-5mm]  %\smallskip
642
6c13f76c070b updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 617
diff changeset
  1739
   
643
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1740
   %\noindent
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1741
   %Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.
9580bae0500d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 642
diff changeset
  1742
   %%\\[-10mm]
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1743
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1744
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1745
%%\bibliographystyle{plain}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1746
\bibliography{root}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1747
*}
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1748
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1749
(*<*)
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1750
text {*
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1751
\newpage
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1752
\appendix
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1753
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1754
\section{Some Proofs}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1755
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1756
While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
569
5af61c89f51e updated paper and corresponding theories
Christian Urban <christian.urban@kcl.ac.uk>
parents: 563
diff changeset
  1757
rough details of our reasoning in ``English'' if this is helpful for reviewing.
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1758
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1759
\begin{proof}[Proof of Lemma~\ref{codedecode}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1760
  This follows from the property that
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1761
  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1762
  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1763
  easily proved by induction on $\vdash v : r$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1764
\end{proof}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1765
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1766
\begin{proof}[Proof of Lemma~\ref{mainlemma}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1767
  This can be proved by induction on $s$ and generalising over
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1768
  $v$. The interesting point is that we need to prove this in the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1769
  reverse direction for $s$. This means instead of cases $[]$ and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1770
  $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1771
  string from the back.\footnote{Isabelle/HOL provides an induction principle
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1772
    for this way of performing the induction.}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1773
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1774
  The case for $[]$ is routine using Lemmas~\ref{codedecode}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1775
  and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1776
  the assumption that $\vdash v : (r\backslash s)\backslash c$
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1777
  holds. Hence by Prop.~\ref{Posix2} we know that 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1778
  (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1779
  By definition of $\textit{flex}$ we can unfold the left-hand side
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1780
  to be
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1781
  \[
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1782
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1783
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1784
  \]  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1785
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1786
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1787
  By IH and (*) we can rewrite the right-hand side to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1788
  $\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1789
    (\inj\,(r\backslash s)\,c\,\,v))\,r$ which is equal to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1790
  $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1791
  (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1792
  because we generalised over $v$ in our induction.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1793
\end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1794
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1795
\begin{proof}[Proof of Theorem~\ref{thmone}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1796
  We can first expand both sides using Lem.~\ref{flex} and the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1797
  definition of \textit{blexer}. This gives us two
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1798
  \textit{if}-statements, which we need to show to be equal. By 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1799
  Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1800
  $\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1801
   \nullable(r\backslash s)$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1802
  For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1803
  $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1804
  by Lemma~\ref{bnullable}\textit{(3)} that
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1805
  %
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1806
  \[
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1807
    \textit{decode}(\textit{bmkeps}\:r_d)\,r =
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1808
    \textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1809
  \]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1810
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1811
  \noindent
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1812
  where the right-hand side is equal to
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1813
  $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1814
  d))$ by Lemma~\ref{mainlemma} (we know
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1815
  $\vdash \textit{mkeps}\,d : d$ by (*)).  This shows the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1816
  \textit{if}-branches return the same value. In the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1817
  \textit{else}-branches both \textit{lexer} and \textit{blexer} return
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1818
  \textit{None}. Therefore we can conclude the proof.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1819
\end{proof}  
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1820
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1821
\begin{proof}[Proof of Lemma~\ref{lemone}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1822
     By induction on @{text r}. For this we can use the properties
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1823
     @{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1824
     repeated applications of the $LD$ rule which allows the removal
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1825
     of duplicates that can recognise the same strings. 
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1826
     \end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1827
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1828
    \begin{proof}[Proof of Lemma~\ref{lembnull}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1829
     Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1830
     The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1831
     be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1832
     @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1833
     \end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1834
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1835
 \begin{proof}[Proof of Lemma \ref{lemthree}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1836
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1837
     Again the only interesting case is the rule $LD$ where we need to ensure that
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1838
     @{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1839
        bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1840
     This is indeed the case because according to the POSIX rules the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1841
     generated bitsequence is determined by the first alternative that can match the
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1842
     string (in this case being nullable).
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1843
     \end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1844
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1845
     \begin{proof}[Proof of Lemma~\ref{bderlem}]
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1846
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1847
     The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1848
     if and only if @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1849
     \end{proof}
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1850
*}
615
8881a09a06fd updated paper for FoSSaCS
Christian Urban <christian.urban@kcl.ac.uk>
parents: 599
diff changeset
  1851
(*>*)
496
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1852
(*<*)
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1853
end
f493a20feeb3 updated to include the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1854
(*>*)