thys2/Paper/Paper.thy
author Chengsong
Wed, 12 Oct 2022 14:01:33 +0100
changeset 613 b0f0d884a547
parent 474 726f4e65c0fe
permissions -rw-r--r--
chap5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
396
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     1
(*<*)
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     2
theory Paper
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     3
imports 
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     4
   "../Lexer"
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     5
   "../Simplifying" 
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     6
   "../Positions"
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     7
   "../SizeBound4" 
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     8
   "HOL-Library.LaTeXsugar"
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     9
begin
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    10
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    11
declare [[show_question_marks = false]]
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    12
398
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
    13
notation (latex output)
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
    14
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
    15
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
    16
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
    17
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    18
(* der_syn is an argument-flipped alias for der: "der_syn r c" abbreviates
   "der c r", i.e. the regular expression comes first and the character second. *)
abbreviation 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    19
  "der_syn r c \<equiv> der c r"
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    20
(* ders_syn is an argument-flipped alias for ders: "ders_syn r s" abbreviates
   "ders s r", i.e. the regular expression comes first and the string second. *)
abbreviation 
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
    21
 "ders_syn r s \<equiv> ders s r"  
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
    22
(* bder_syn is an argument-flipped alias for bder: "bder_syn r c" abbreviates
   "bder c r", i.e. the regular expression comes first and the character second. *)
abbreviation 
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    23
  "bder_syn r c \<equiv> bder c r"  
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    24
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    25
notation (latex output)
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    26
  der_syn ("_\\_" [79, 1000] 76) and
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
    27
  ders_syn ("_\\_" [79, 1000] 76) and
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    28
  bder_syn ("_\\_" [79, 1000] 76) and
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    29
  bders ("_\\_" [79, 1000] 76) and
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
    30
  bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    31
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    32
  ZERO ("\<^bold>0" 81) and 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    33
  ONE ("\<^bold>1" 81) and 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    34
  CH ("_" [1000] 80) and
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    35
  ALT ("_ + _" [77,77] 78) and
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    36
  SEQ ("_ \<cdot> _" [77,77] 78) and
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
    37
  STAR ("_\<^sup>*" [79] 78) and
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    38
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    39
  val.Void ("Empty" 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    40
  val.Char ("Char _" [1000] 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    41
  val.Left ("Left _" [79] 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    42
  val.Right ("Right _" [1000] 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    43
  val.Seq ("Seq _ _" [79,79] 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    44
  val.Stars ("Stars _" [79] 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    45
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    46
  Prf ("\<turnstile> _ : _" [75,75] 75) and  
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    47
  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    48
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    49
  flat ("|_|" [75] 74) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    50
  flats ("|_|" [72] 74) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    51
  injval ("inj _ _ _" [79,77,79] 76) and 
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    52
  mkeps ("mkeps _" [79] 76) and 
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    53
  length ("len _" [73] 73) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    54
  set ("_" [73] 73) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    55
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    56
  AZERO ("ZERO" 81) and 
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
    57
  AONE ("ONE _" [79] 78) and 
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    58
  ACHAR ("CHAR _ _" [79, 79] 80) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    59
  AALTs ("ALTs _ _" [77,77] 78) and
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
    60
  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    61
  ASTAR ("STAR _ _" [79, 79] 78) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    62
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    63
  code ("code _" [79] 74) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    64
  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    65
  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    66
  bnullable ("bnullable _" [1000] 80) and
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
    67
  bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
    68
  bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    69
  bmkeps ("bmkeps _" [1000] 80) and
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
    70
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    71
  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    72
  rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
    73
  srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
    74
  blexer_simp ("blexer\<^sup>+" 1000) 
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    75
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    76
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    77
(* better_retrieve: two equations for retrieve on an AALTs node whose list of
   alternatives is r # rs with rs non-empty.
     - For a value Left v the result is bs @ retrieve r v.
     - For a value Right v the result is bs @ retrieve (AALTs [] rs) v.
   The first goal is closed by metis with list.exhaust and retrieve.simps(4),
   the second with list.exhaust and retrieve.simps(5).
   NOTE(review): presumably stated in this form so the paper can print the
   two clauses directly -- confirm against the later use sites. *)
lemma better_retrieve:
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    78
   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    79
   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    80
  apply (metis list.exhaust retrieve.simps(4))
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
    81
  by (metis list.exhaust retrieve.simps(5))
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    82
396
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    83
(*>*)
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    84
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    85
section {* Introduction *}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    86
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    87
text {*
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
    88
400
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
    89
In the last fifteen or so years, Brzozowski's derivatives of regular
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
    90
expressions have sparked quite a bit of interest in the functional
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
    91
programming and theorem prover communities.  The beauty of
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
    92
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
    93
expressible in any functional language, and easily definable and
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
    94
reasoned about in theorem provers---the definitions just consist of
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    95
inductive datatypes and simple recursive functions.  Derivatives of a
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    96
regular expression, written @{term "der c r"}, give a simple solution
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    97
to the problem of matching a string @{term s} with a regular
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    98
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
    99
succession) all the characters of the string matches the empty string,
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   100
then @{term r} matches @{term s} (and {\em vice versa}).  We are aware
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   101
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   102
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
   103
of the work by Krauss and Nipkow~\cite{Krauss2011}.  And another one
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   104
in Coq is given by Coquand and Siles~\cite{Coquand2012}.
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
   105
Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   106
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   107
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   108
However, there are two difficulties with derivative-based matchers:
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   109
First, Brzozowski's original matcher only generates a yes/no answer
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   110
for whether a regular expression matches a string or not.  This is too
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   111
little information in the context of lexing where separate tokens must
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   112
be identified and also classified (for example as keywords
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   113
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   114
difficulty by cleverly extending Brzozowski's matching
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   115
algorithm. Their extended version generates additional information on
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   116
\emph{how} a regular expression matches a string following the POSIX
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   117
rules for regular expression matching. They achieve this by adding a
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   118
second ``phase'' to Brzozowski's algorithm involving an injection
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   119
function.  In our own earlier work we provided the formal
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   120
specification of what POSIX matching means and proved in Isabelle/HOL
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   121
the correctness
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   122
of Sulzmann and Lu's extended algorithm accordingly
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   123
\cite{AusafDyckhoffUrban2016}.
400
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
   124
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   125
The second difficulty is that Brzozowski's derivatives can 
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   126
grow to arbitrarily big sizes. For example, if we start with the
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   127
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   128
successive derivatives according to the character $a$, we end up with
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   129
a sequence of ever-growing derivatives like 
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   130
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   131
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   132
\begin{center}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   133
\begin{tabular}{rll}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   134
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   135
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   136
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   137
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   138
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   139
\end{tabular}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   140
\end{center}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   141
 
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   142
\noindent where after around 35 steps we run out of memory on a
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   143
typical computer (we shall define shortly the precise details of our
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   144
regular expressions and the derivative operation).  Clearly, the
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   145
notation involving $\ZERO$s and $\ONE$s already suggests
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   146
simplification rules that can be applied to regular
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   147
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   148
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   149
r$. While such simple-minded simplifications have been proved in our
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   150
earlier work to preserve the correctness of Sulzmann and Lu's
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   151
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   152
\emph{not} help with limiting the growth of the derivatives shown
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   153
above: the growth is slowed, but the derivatives can still grow rather
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   154
quickly beyond any finite bound.
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   155
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   156
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   157
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   158
\cite{Sulzmann2014} where they introduce bitcoded
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   159
regular expressions. In this version, POSIX values are
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   160
represented as bitsequences and such sequences are incrementally generated
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   161
when derivatives are calculated. The compact representation
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   162
of bitsequences and regular expressions allows them to define a more
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   163
``aggressive'' simplification method that keeps the size of the
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
   164
derivatives finitely bounded no matter what the length of the string is.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   165
They make some informal claims about the correctness and linear behaviour
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   166
of this version, but do not provide any supporting proof arguments, not
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   167
even ``pencil-and-paper'' arguments. They write about their bitcoded
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   168
\emph{incremental parsing method} (that is the algorithm to be formalised
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   169
in this paper):
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   170
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   171
\begin{quote}\it
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   172
  ``Correctness Claim: We further claim that the incremental parsing
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   173
  method [..] in combination with the simplification steps [..]
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   174
  yields POSIX parse trees. We have tested this claim
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   175
  extensively [..] but yet
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
   176
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   177
\end{quote}  
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   178
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   179
\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   180
the derivative-based lexing algorithm of Sulzmann and Lu
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   181
\cite{Sulzmann2014} where regular expressions are annotated with
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   182
bitsequences. We define the crucial simplification function as a
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
   183
recursive function, without the need for a fix-point operation. One objective of
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   184
the simplification function is to remove duplicates of regular
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   185
expressions.  For this Sulzmann and Lu use in their paper the standard
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   186
@{text nub} function from Haskell's list library, but this function
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   187
does not achieve the intended objective with bitcoded regular expressions.  The
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   188
reason is that in the bitcoded setting, each copy generally has a
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   189
different bitcode annotation---so @{text nub} would never ``fire''.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   190
Inspired by Scala's library for lists, we shall instead use a @{text
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   191
distinctBy} function that finds duplicates under an erasing function
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   192
which deletes bitcodes.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   193
We shall also introduce our own argument and definitions for
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   194
establishing the correctness of the bitcoded algorithm when 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   195
simplifications are included.\medskip
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   196
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   197
\noindent In this paper, we shall first briefly introduce the basic notions
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
   198
of regular expressions and describe the definition
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   199
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   200
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   201
the correctness of the bitcoded algorithm without simplification, and
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   202
after that extend the proof to include simplification. 
400
46e5566ad4ba updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 398
diff changeset
   203
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   204
*}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   205
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   206
section {* Background *}
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   207
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   208
text {*
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   209
  In our Isabelle/HOL formalisation strings are lists of characters with
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   210
  the empty string being represented by the empty list, written $[]$,
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   211
  and list-cons being written as $\_\!::\!\_\,$; string
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   212
  concatenation is $\_ \,@\, \_\,$. We often use the usual
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   213
  bracket notation for lists also for strings; for example a string
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   214
  consisting of just a single character $c$ is written $[c]$.   
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   215
  Our regular expressions are defined as usual as the elements of the following inductive
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   216
  datatype:
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   217
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   218
  \begin{center}
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   219
  @{text "r ::="} \;
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   220
  @{const "ZERO"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   221
  @{const "ONE"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   222
  @{term "CH c"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   223
  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   224
  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   225
  @{term "STAR r"} 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   226
  \end{center}
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   227
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   228
  \noindent where @{const ZERO} stands for the regular expression that does
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   229
  not match any string, @{const ONE} for the regular expression that matches
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   230
  only the empty string and @{term c} for matching a character literal.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   231
  The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
461
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
   232
  We sometimes omit the $\cdot$ in a sequence regular expression for brevity. 
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   233
  The
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
   234
  \emph{language} of a regular expression, written $L(r)$, is defined as usual
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   235
  and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   236
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   237
  Central to Brzozowski's regular expression matcher are two functions
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   238
  called @{text nullable} and \emph{derivative}. The latter is written
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   239
  $r\backslash c$ for the derivative of the regular expression $r$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   240
  w.r.t.~the character $c$. Both functions are defined by recursion over
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   241
  regular expressions.  
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   242
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   243
\begin{center}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   244
\begin{tabular}{cc}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   245
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   246
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   247
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   248
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   249
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   250
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   251
  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   252
  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   253
  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   254
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   255
  \end{tabular}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   256
  &
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   257
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   258
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   259
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   260
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   261
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   262
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   263
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   264
  \end{tabular}  
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   265
\end{tabular}  
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   266
\end{center}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   267
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   268
  \noindent
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   269
  We can extend this definition to give derivatives w.r.t.~strings:
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   270
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   271
  \begin{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   272
  \begin{tabular}{cc}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   273
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   274
  @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   275
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   276
  &
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   277
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   278
  @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   279
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   280
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   281
  \end{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   282
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   283
\noindent
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   284
Using @{text nullable} and the derivative operation, we can
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   285
define the following simple regular expression matcher:
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   286
%
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   287
\[
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   288
@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   289
\]
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   290
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   291
\noindent This is essentially Brzozowski's algorithm from 1964. Its
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   292
main virtue is that the algorithm can be easily implemented as a
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   293
functional program (either in a functional programming language or in
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   294
a theorem prover). The correctness proof for @{text match} amounts to
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   295
establishing the property
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   296
%
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   297
\begin{proposition}\label{matchcorr} 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   298
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   299
\end{proposition}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   300
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   301
\noindent
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
   302
It is a fun exercise to formally prove this property in a theorem prover.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   303
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   304
The novel idea of Sulzmann and Lu is to extend this algorithm for 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   305
lexing, where it is important to find out which part of the string
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   306
is matched by which part of the regular expression.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   307
For this Sulzmann and Lu presented two lexing algorithms in their paper
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   308
  \cite{Sulzmann2014}. The first algorithm consists of two phases: first a
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   309
  matching phase (which is Brzozowski's algorithm) and then a value
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   310
  construction phase. The values encode \emph{how} a regular expression
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   311
  matches a string. \emph{Values} are defined as the inductive datatype
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   312
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   313
  \begin{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   314
  @{text "v ::="}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   315
  @{const "Void"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   316
  @{term "val.Char c"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   317
  @{term "Left v"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   318
  @{term "Right v"} $\mid$
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   319
  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   320
  @{term "Stars vs"} 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   321
  \end{center}  
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   322
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   323
  \noindent where we use @{term vs} to stand for a list of values. The
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   324
  string underlying a value can be calculated by a @{const flat}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   325
  function, written @{term "flat DUMMY"}. It traverses a value and
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   326
  collects the characters contained in it. Sulzmann and Lu also define inductively an
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   327
  inhabitation relation that associates values to regular expressions:
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   328
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   329
  \begin{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   330
  \begin{tabular}{c}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   331
  \\[-8mm]
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   332
  @{thm[mode=Axiom] Prf.intros(4)} \qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   333
  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   334
  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad 
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   335
  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\\[4mm]
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   336
  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   337
  @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   338
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   339
  \end{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   340
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   341
  \noindent Note that no values are associated with the regular expression
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   342
  @{term ZERO}, since it cannot match any string.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   343
  It is routine to establish how values ``inhabiting'' a regular
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   344
  expression correspond to the language of a regular expression, namely
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   345
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   346
  \begin{proposition}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   347
  @{thm L_flat_Prf}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   348
  \end{proposition}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   349
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   350
  In general there is more than one value inhabiting a regular
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   351
  expression (meaning regular expressions can typically match more
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   352
  than one string). But even when fixing a string from the language of the
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   353
  regular expression, there is generally more than one way in which the
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   354
  regular expression can match this string. POSIX lexing is about
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   355
  identifying the unique value for a given regular expression and a
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   356
  string that satisfies the informal POSIX rules (see
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   357
  \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   358
	lexing acquired its name from the fact that the corresponding
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   359
	rules were described as part of the POSIX specification for
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   360
	Unix-like operating systems \cite{POSIX}.} Sometimes these
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
   361
  informal rules are called the \emph{maximal munch rule} and \emph{rule priority}.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   362
  One contribution of our earlier paper is to give a convenient
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   363
 specification for what POSIX values are (the inductive rules are shown in
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   364
  Figure~\ref{POSIXrules}).
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   365
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   366
\begin{figure}[t]
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   367
  \begin{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   368
  \begin{tabular}{c}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   369
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   370
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   371
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   372
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   373
  $\mprset{flushleft}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   374
   \inferrule
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   375
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   376
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   377
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   378
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   379
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   380
  $\mprset{flushleft}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   381
   \inferrule
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   382
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   383
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   384
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   385
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   386
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   387
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   388
  \end{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   389
  \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   390
  of given a string $s$ and a regular
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   391
  expression $r$ what is the unique value $v$ that satisfies the informal POSIX constraints for
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   392
  regular expression matching.}\label{POSIXrules}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   393
  \end{figure}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   394
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   395
  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   396
  an injection function on values that mirrors (but inverts) the
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   397
  construction of the derivative on regular expressions. Essentially it
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   398
  injects back a character into a value.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   399
  For this they define two functions called @{text mkeps} and @{text inj}:
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   400
 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   401
  \begin{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   402
  \begin{tabular}{l}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   403
  \begin{tabular}{lcl}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   404
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   405
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   406
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   407
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   408
  \end{tabular}\smallskip\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   409
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   410
  \begin{tabular}{lcl}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   411
  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   412
  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   413
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   414
  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   415
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   416
  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   417
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   418
  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   419
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   420
  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   421
      & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   422
  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   423
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   424
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   425
  \end{tabular}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   426
  \end{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   427
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   428
  \noindent
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   429
  The function @{text mkeps} is run when the last derivative is nullable, that is,
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   430
  the string to be matched is in the language of the regular expression. It generates
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   431
  a value for how the last derivative can match the empty string. The injection function
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   432
  then calculates the corresponding value for each intermediate derivative until
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   433
  a value for the original regular expression is generated.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   434
  Graphically the algorithm by
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   435
  Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   436
  where the path from the left to the right involving @{term derivatives}/@{const
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   437
  nullable} is the first phase of the algorithm (calculating successive
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   438
  \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   439
  left, the second phase. The picture shows the steps required when a
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   440
  regular expression, say @{text "r\<^sub>1"}, matches the string @{term
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   441
  "[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   442
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   443
  \begin{figure}[t]
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   444
\begin{center}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   445
\begin{tikzpicture}[scale=2,node distance=1.3cm,
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   446
                    every node/.style={minimum size=6mm}]
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   447
\node (r1)  {@{term "r\<^sub>1"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   448
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   449
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   450
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   451
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   452
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   453
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   454
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   455
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   456
\draw[->,line width=1mm](r4) -- (v4);
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   457
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   458
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   459
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   460
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   461
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   462
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   463
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   464
\end{tikzpicture}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   465
\end{center}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   466
\mbox{}\\[-13mm]
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   467
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   468
\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   469
matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   470
left to right) is \Brz's matcher building successive derivatives. If the 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   471
last regular expression is @{term nullable}, then the functions of the 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   472
second phase are called (the top-down and right-to-left arrows): first 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   473
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   474
how the empty string has been recognised by @{term "r\<^sub>4"}. After
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   475
that the function @{term inj} ``injects back'' the characters of the string into
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   476
the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   477
the POSIX value for this string and
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   478
regular expression.
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   479
\label{Sulz}}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   480
\end{figure} 
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   481
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   482
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   483
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   484
  \begin{center}
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   485
  \begin{tabular}{lcl}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   486
  @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   487
  @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   488
                     & & \phantom{$|$} @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   489
                     & & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   490
  \end{tabular}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   491
  \end{center}
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   492
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   493
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   494
We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   495
this algorithm is correct, that is, it generates POSIX values. The
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
   496
central property we established relates the derivative operation to the
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   497
injection function.
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   498
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   499
  \begin{proposition}\label{Posix2}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   500
	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   501
\end{proposition}
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   502
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   503
  \noindent
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   504
  With this in place we were able to prove:
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   505
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   506
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   507
  \begin{proposition}\mbox{}\smallskip\\\label{lexercorrect}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   508
  \begin{tabular}{ll}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   509
  (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   510
  (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   511
  \end{tabular}
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   512
  \end{proposition}
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   513
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   514
  \noindent
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
   515
  In fact we have shown that, in the success case, the generated POSIX value $v$ is
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   516
  unique and in the failure case that there is no POSIX value $v$ that satisfies
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
   517
  $(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   518
  slow in cases where the derivatives grow arbitrarily (recall the example from the
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   519
  Introduction). However, it can be used as a convenient reference point for the correctness
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   520
  proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   521
  
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   522
*}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   523
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   524
section {* Bitcoded Regular Expressions and Derivatives *}
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   525
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   526
text {*
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   527
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   528
  In the second part of their paper \cite{Sulzmann2014},
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   529
  Sulzmann and Lu describe another algorithm that also generates POSIX
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
   530
  values but dispenses with the second phase where characters are
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   531
  injected ``back'' into values. For this they attach bitcodes to
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   532
  regular expressions, which we define in Isabelle/HOL as the datatype
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   533
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   534
  \begin{center}
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   535
  \begin{tabular}{lcl}
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   536
  @{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   537
               & $\mid$ & @{term "ACHAR bs c"}\\
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   538
               & $\mid$ & @{term "AALTs bs rs"}\\
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   539
               & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   540
               & $\mid$ & @{term "ASTAR bs r"}
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   541
  \end{tabular}
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   542
  \end{center}
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   543
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   544
  \noindent where @{text bs} stands for bitsequences; @{text r},
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   545
  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   546
  expressions; and @{text rs} for lists of bitcoded regular
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   547
  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   548
  is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}. 
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   549
  For bitsequences we use lists made up of the
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   550
  constants @{text Z} and @{text S}.  The idea with bitcoded regular
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   551
  expressions is to incrementally generate the value information (for
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   552
  example @{text Left} and @{text Right}) as bitsequences. For this 
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   553
  Sulzmann and Lu define a coding
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   554
  function for how values can be coded into bitsequences.
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   555
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   556
  \begin{center}
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   557
  \begin{tabular}{cc}
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   558
  \begin{tabular}{lcl}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   559
  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   560
  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   561
  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   562
  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   563
  \end{tabular}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   564
  &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   565
  \begin{tabular}{lcl}
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   566
  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   567
  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   568
  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   569
  \mbox{\phantom{XX}}\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   570
  \end{tabular}
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   571
  \end{tabular}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   572
  \end{center}
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   573
   
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   574
  \noindent
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   575
  As can be seen, this coding is ``lossy'' in the sense that we do not
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   576
  explicitly record character values, nor sequence values (for
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   577
  them we just append two bitsequences). However, the
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   578
  different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z}, respectively
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   579
  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   580
  that there is still a value coming in the list of @{text Stars}, whereas @{text S}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   581
  indicates the end of the list. The lossiness makes the process of
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   582
  decoding a bit more involved, but the point is that if we have a
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   583
  regular expression \emph{and} a bitsequence of a corresponding value,
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   584
  then we can always decode the value accurately. The decoding can be
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   585
  defined by using two functions called $\textit{decode}'$ and
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   586
  \textit{decode}:
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   587
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   588
  \begin{center}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   589
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   590
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   591
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   592
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   593
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   594
       (\Left\,v, bs_1)$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   595
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   596
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   597
       (\Right\,v, bs_1)$\\                           
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   598
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   599
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   600
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   601
        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   602
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   603
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   604
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   605
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   606
        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   607
  $\textit{decode}\,bs\,r$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   608
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   609
  & & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   610
       \textit{else}\;\textit{None}$   
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   611
  \end{tabular}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   612
  \end{center}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   613
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   614
  \noindent
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   615
  The function \textit{decode} checks whether all of the bitsequence is
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   616
  consumed and returns the corresponding value as @{term "Some v"}; otherwise
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   617
  it fails with @{text "None"}. We can establish that for a value $v$
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   618
  inhabiting a regular expression $r$, the decoding of its
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   619
  bitsequence never fails.
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   620
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   621
\begin{lemma}\label{codedecode}\it
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   622
  If $\;\vdash v : r$ then
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   623
  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   624
\end{lemma}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   625
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   626
\begin{proof}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   627
  This follows from the property that
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   628
  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   629
  for any bitsequence $bs$ and $\vdash v : r$. This property can be
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   630
  easily proved by induction on $\vdash v : r$.
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   631
\end{proof}  
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   632
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   633
  Sulzmann and Lu define the function \emph{internalise}
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   634
  in order to transform (standard) regular expressions into annotated
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   635
  regular expressions. We write this operation as $r^\uparrow$.
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   636
  This internalisation uses the following
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   637
  \emph{fuse} function.	     
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   638
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   639
  \begin{center}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   640
  \begin{tabular}{lcl}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   641
  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   642
  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   643
     $\textit{ONE}\,(bs\,@\,bs')$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   644
  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   645
     $\textit{CHAR}\,(bs\,@\,bs')\,c$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   646
  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   647
     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   648
  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   649
     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   650
  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   651
     $\textit{STAR}\,(bs\,@\,bs')\,r$
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   652
  \end{tabular}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   653
  \end{center}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   654
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   655
  \noindent
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   656
  A regular expression can then be \emph{internalised} into a bitcoded
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   657
  regular expression as follows:
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   658
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   659
  \begin{center}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   660
  \begin{tabular}{lcl}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   661
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   662
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   663
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   664
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   665
         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   666
                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   667
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   668
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   669
  $(r^*)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   670
         $\textit{STAR}\;[]\,r^\uparrow$\\
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   671
  \end{tabular}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   672
  \end{center}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   673
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   674
  \noindent
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   675
  There is also an \emph{erase}-function, written $r^\downarrow$, which
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   676
  transforms a bitcoded regular expression into a (standard) regular
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   677
  expression by just erasing the annotated bitsequences. We omit the
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   678
  straightforward definition. For defining the algorithm, we also need
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   679
  the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   680
  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   681
  bitcoded regular expressions.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   682
  %
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   683
  \begin{center}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   684
  \begin{tabular}{@ {}c@ {}c@ {}}
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   685
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   686
  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   687
  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   688
  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   689
  $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   690
     $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   691
  $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   692
     $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   693
  $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   694
     $\textit{True}$
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   695
  \end{tabular}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   696
  &
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   697
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   698
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   699
  $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   700
  $bs\,@\,\textit{bmkepss}\,\rs$\\
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   701
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   702
  \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   703
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   704
     $bs \,@\, [\S]$\\
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   705
  $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   706
     $\textit{if}\;\textit{bnullable}\,r$\\
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   707
  & &$\textit{then}\;\textit{bmkeps}\,r$\\
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
   708
  & &$\textit{else}\;\textit{bmkepss}\,\rs$
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   709
  \end{tabular}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   710
  \end{tabular}
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   711
  \end{center}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   712
 
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   713
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   714
  \noindent
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   715
  The key function in the bitcoded algorithm is the derivative of a
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   716
  bitcoded regular expression. This derivative function calculates the
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   717
  derivative but at the same time also the incremental part of the bitsequences
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   718
  that contribute to constructing a POSIX value.	
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   719
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   720
  \begin{center}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   721
  \begin{tabular}{@ {}lcl@ {}}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   722
  $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\  
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   723
  $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   724
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   725
        $\textit{if}\;c=d\; \;\textit{then}\;
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   726
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   727
  $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   728
        $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   729
  $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   730
     $\textit{if}\;\textit{bnullable}\,r_1$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   731
  & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   732
  & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   733
  & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   734
  $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   735
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   736
       (\textit{STAR}\,[]\,r)$
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   737
  \end{tabular}    
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   738
  \end{center}
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   739
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   740
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   741
  \noindent
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   742
  This function can also be extended to strings, written $r\backslash s$,
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   743
  just like the standard derivative.  We omit the details. Finally we
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   744
  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   745
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   746
  \begin{center}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   747
\begin{tabular}{lcl}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   748
  $\textit{blexer}\;r\,s$ & $\dn$ &
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   749
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\                
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   750
  & & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   751
       \;\;\textit{else}\;\textit{None}$
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   752
\end{tabular}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   753
\end{center}
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   754
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   755
  \noindent
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   756
This bitcoded lexer first internalises the regular expression $r$ and then
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   757
builds the bitcoded derivative according to $s$. If the derivative is
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   758
(b)nullable, the string is in the language of $r$ and it extracts the bitsequence using the
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   759
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value.  If
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   760
the derivative is \emph{not} nullable, then $\textit{None}$ is
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   761
returned. We can show that this way of calculating a value
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   762
generates the same result as \textit{lexer}.
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   763
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   764
Before we can proceed we need to define a helper function, called
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   765
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
416
57182b36ec01 more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 410
diff changeset
   766
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   767
\begin{center}
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   768
  \begin{tabular}{lcl}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   769
  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   770
  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   771
  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   772
  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   773
  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   774
  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   775
      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   776
  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   777
  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   778
  \end{tabular}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   779
  \end{center}
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   780
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   781
\noindent
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   782
The idea behind this function is to retrieve a possibly partial
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   783
bitsequence from a bitcoded regular expression, where the retrieval is
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   784
guided by a value.  For example if the value is $\Left$ then we
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   785
descend into the left-hand side of an alternative in order to
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   786
assemble the bitcode. Similarly for
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   787
$\Right$. The property we can show is that for a given $v$ and $r$
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   788
with $\vdash v : r$, the retrieved bitsequence from the internalised
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   789
regular expression is equal to the bitcoded version of $v$.
402
1612f2a77bf6 more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 400
diff changeset
   790
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   791
\begin{lemma}\label{retrievecode}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   792
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   793
\end{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   794
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   795
\noindent
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   796
We also need some auxiliary facts about how the bitcoded operations
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   797
relate to the ``standard'' operations on regular expressions. For
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   798
example if we build a bitcoded derivative and erase the result, this
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   799
is the same as if we first erase the bitcoded regular expression and
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   800
then perform the ``standard'' derivative operation.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   801
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   802
\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   803
  \begin{tabular}{ll}
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   804
\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\    
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   805
\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   806
\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$.
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   807
\end{tabular}  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   808
\end{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   809
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   810
\begin{proof}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   811
  All properties are by induction on annotated regular expressions. There are no
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   812
  interesting cases.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   813
\end{proof}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   814
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   815
\noindent
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   816
The only difficulty left for the correctness proof is that the bitcoded algorithm
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   817
has only a ``forward phase'' where POSIX values are generated incrementally.
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   818
We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   819
functions during the forward phase. An auxiliary function, called $\textit{flex}$,
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   820
allows us to recast the rules of $\lexer$ in terms of a single
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   821
phase and stacked up injection functions.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   822
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   823
\begin{center}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   824
\begin{tabular}{lcl}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   825
  $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   826
  $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   827
  $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$\\
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   828
\end{tabular}    
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   829
\end{center}    
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   830
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   831
\noindent
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   832
The point of this function is that when
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   833
reaching the end of the string, we just need to apply the stacked up
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   834
injection functions to the value generated by @{text mkeps}.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   835
Using this function we can recast the success case in @{text lexer} 
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   836
as follows:
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   837
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   838
\begin{proposition}\label{flex}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   839
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,\textit{id}\,s\,
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   840
      (\mkeps (r\backslash s))$.
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   841
\end{proposition}
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   842
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   843
\noindent
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   844
Note that we did not redefine \textit{lexer}; we just established that the
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   845
value generated by \textit{lexer} can also be obtained by a different
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   846
method. While this different method is not efficient (we essentially
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   847
need to traverse the string $s$ twice, once for building the
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   848
derivative $r\backslash s$ and another time for stacking up injection
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   849
functions using \textit{flex}), it helps us with proving
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   850
that incrementally building up values in @{text blexer} generates the same result.
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   851
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   852
This brings us to our main lemma in this section: if we calculate a
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   853
derivative, say $r\backslash s$, and have a value, say $v$, inhabited
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   854
by this derivative, then we can produce the result @{text lexer} generates
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   855
by feeding this value into the stacked-up injection functions
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   856
that $\textit{flex}$ assembles. The lemma establishes that this is the same
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   857
value as if we build the annotated derivative $r^\uparrow\backslash s$
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   858
and then retrieve the corresponding bitcoded version, followed by a
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   859
decoding step.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   860
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   861
\begin{lemma}[Main Lemma]\label{mainlemma}\it
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   862
If $\vdash v : r\backslash s$ then 
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   863
\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   864
  \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\]
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   865
\end{lemma}  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   866
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   867
\begin{proof}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   868
  This can be proved by induction on $s$ and generalising over
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   869
  $v$. The interesting point is that we need to prove this in the
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   870
  reverse direction for $s$. This means instead of cases $[]$ and
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   871
  $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   872
  string from the back.\footnote{Isabelle/HOL provides an induction principle
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   873
    for this way of performing the induction.}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   874
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   875
  The case for $[]$ is routine using Lemmas~\ref{codedecode}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   876
  and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   877
  the assumption that $\vdash v : (r\backslash s)\backslash c$
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   878
  holds. Hence by Prop.~\ref{Posix2} we know that 
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   879
  (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   880
  By definition of $\textit{flex}$ we can unfold the left-hand side
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   881
  to be
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   882
  \[
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   883
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   884
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   885
  \]  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   886
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   887
  \noindent
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   888
  By induction hypothesis and (*) we can rewrite the right-hand side to
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   889
  %
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   890
  \[
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   891
    \textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   892
    (\inj\,(r\backslash s)\,c\,\,v))\,r
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   893
  \]
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   894
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   895
  \noindent
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   896
  which is equal to
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   897
  $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   898
  (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   899
  because we generalised over $v$ in our induction.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   900
\end{proof}  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   901
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   902
\noindent
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   903
With this lemma in place, we can prove the correctness of \textit{blexer}---it indeed
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   904
produces the same result as \textit{lexer}.
405
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   905
3cfea5bb5e23 updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents: 402
diff changeset
   906
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   907
\begin{theorem}\label{thmone}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   908
$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   909
\end{theorem}  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   910
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   911
\begin{proof}
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   912
  We can first expand both sides using Prop.~\ref{flex} and the
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   913
  definition of \textit{blexer}. This gives us two
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   914
  \textit{if}-statements, which we need to show to be equal. By 
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   915
  Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   916
  \[
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   917
    \textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   918
    \nullable(r\backslash s)
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   919
  \]
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   920
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   921
  \noindent
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   922
  For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   923
  $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   924
  by Lemma~\ref{bnullable}\textit{(3)} that
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   925
  %
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   926
  \[
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   927
    \textit{decode}(\textit{bmkeps}\:r_d)\,r =
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
   928
    \textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   929
  \]
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   930
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   931
  \noindent
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   932
  where the right-hand side is equal to
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   933
  $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   934
  d))$ by Lemma~\ref{mainlemma} (we know
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   935
  $\vdash \textit{mkeps}\,d : d$ by (*)).  This shows the
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   936
  \textit{if}-branches return the same value. In the
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   937
  \textit{else}-branches both \textit{lexer} and \textit{blexer} return
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   938
  \textit{None}. Therefore we can conclude the proof.
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   939
\end{proof}  
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   940
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   941
\noindent This establishes that the bitcoded algorithm by Sulzmann and
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   942
Lu \emph{without} simplification produces correct results. This was
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   943
only conjectured by Sulzmann and Lu in their paper
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   944
\cite{Sulzmann2014}. The next step is to add simplifications.
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   945
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   946
*}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   947
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   948
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   949
section {* Simplification *}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   950
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
   951
text {*
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
   952
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   953
     Derivatives as calculated by Brzozowski's method are usually more
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
   954
     complex regular expressions than the initial one; the result is
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   955
     that derivative-based matching and lexing algorithms are
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   956
     often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
423
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   957
     optimisations are possible, such as the simplifications
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   958
     $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
b7199d6c672d updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 420
diff changeset
   959
     $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   960
     simplifications can considerably speed up the two algorithms  in many
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   961
     cases, they do not fundamentally solve the growth problem with
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   962
     derivatives. To see this let us return to the example from the
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   963
     Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   964
     If we delete in the 3rd step all $\ZERO{}$s and $\ONE$s according to
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   965
     the simplification rules shown above we obtain
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   966
     %
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   967
     \def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}%%
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   968
     %
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   969
     \begin{equation}\label{derivex}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   970
     (a + aa)^* \quad\xll\quad
463
421397f267b9 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 462
diff changeset
   971
      \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
421397f267b9 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 462
diff changeset
   972
     ((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   973
     \end{equation}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   974
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   975
     \noindent This is a simpler derivative, but unfortunately we
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   976
     cannot make any further simplifications. This is a problem because
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   977
     the outermost alternative contains two copies of the same
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   978
     regular expression (underbraced with $r$). These copies will
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   979
     spawn new copies in later derivative steps and they in turn even more copies. This
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   980
     destroys any hope of taming the size of the derivatives.  But the
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   981
     second copy of $r$ in \eqref{derivex} will never contribute to a
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   982
     value, because POSIX lexing will always prefer matching a string
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   983
     with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   984
     The dilemma with the simple-minded
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   985
     simplification rules above is that the rule $r + r \Rightarrow r$
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   986
     will never be applicable because as can be seen in this example the
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   987
     regular expressions are not next to each other but separated by another regular expression.
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   988
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   989
     But here is where Sulzmann and Lu's representation of generalised
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   990
     alternatives in the bitcoded algorithm shines: in @{term
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   991
     "ALTs bs rs"} we can define a more aggressive simplification by
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   992
     recursively simplifying all regular expressions in @{text rs} and
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   993
     then analysing the resulting list and removing any duplicates.
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   994
     Another advantage of the bitsequences in bitcoded regular
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
   995
     expressions is that they can be easily modified such that simplification does not
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   996
     interfere with the value constructions. For example we can ``flatten'', or
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   997
     de-nest, @{text ALTs} as follows
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   998
     %
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
   999
     \[
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1000
     @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1001
     \quad\xrightarrow{bsimp}\quad
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1002
     @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1003
     \]
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1004
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1005
     \noindent
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1006
     where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1007
     to the alternatives in @{text "rs\<^sub>2"}. As we shall show below, this will
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1008
     ensure that the correct value corresponding to the original (unsimplified)
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1009
     regular expression can still be extracted. %In this way the value construction
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1010
     %is not affected by simplification. 
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1011
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1012
     However there is one problem with the definition for the more
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1013
     aggressive simplification rules described by Sulzmann and Lu. Recasting
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1014
     their definition with our syntax they define the step of removing
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1015
     duplicates as
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1016
     %
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1017
     \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
  1018
     bs (nub (map bsimp rs))"}
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1019
     \]
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
  1020
   
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1021
     \noindent where they first recursively simplify the regular
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1022
     expressions in @{text rs} (using @{text map}) and then use
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1023
     Haskell's @{text nub}-function to remove potential
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1024
     duplicates. While this makes sense when considering the example
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1025
     shown in \eqref{derivex}, @{text nub} is an inappropriate
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1026
     function in the case of bitcoded regular expressions. The reason
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1027
     is that in general the elements in @{text rs} will have a
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1028
     different annotated bitsequence and in this way @{text nub}
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
  1029
     will never find a duplicate to be removed. One correct way to
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1030
     handle this situation is to first \emph{erase} the regular
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1031
     expressions when comparing potential duplicates. This is inspired
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1032
     by Scala's list functions of the form \mbox{@{text "distinctBy rs f
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1033
     acc"}} where a function is applied first before two elements
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1034
     are compared. We define this function in Isabelle/HOL as
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1035
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1036
     \begin{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1037
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1038
     @{thm (lhs) distinctBy.simps(1)} & $\dn$ & @{thm (rhs) distinctBy.simps(1)}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1039
     @{thm (lhs) distinctBy.simps(2)} & $\dn$ & @{thm (rhs) distinctBy.simps(2)}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1040
     \end{tabular}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1041
     \end{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1042
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1043
     \noindent where we scan the list from left to right (because we
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1044
     have to remove later copies). In @{text distinctBy}, @{text f} is a
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1045
     function and @{text acc} is an accumulator for regular
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
  1046
     expressions---essentially a set of regular expressions that we have already seen
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1047
     while scanning the list. Therefore we delete an element, say @{text x},
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1048
     from the list provided @{text "f x"} is already in the accumulator;
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1049
     otherwise we keep @{text x} and scan the rest of the list but 
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1050
     add @{text "f x"} as another ``seen'' element to @{text acc}. We will use
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
  1051
     @{term distinctBy} where @{text f} is the erase function, @{term "erase (DUMMY)"},
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1052
     that deletes bitsequences from bitcoded regular expressions.
461
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
  1053
     This is clearly a computationally more expensive operation than @{text nub},
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1054
     but is needed in order to make the removal of unnecessary copies
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1055
     work properly.
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1056
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1057
     Our simplification function depends on three helper functions. The first is called
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1058
     @{text flts} and analyses lists of regular expressions coming from alternatives.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1059
     It is defined as follows:
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1060
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1061
     \begin{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1062
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1063
     @{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1064
     @{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1065
     @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1066
     \end{tabular}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1067
     \end{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1068
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1069
     \noindent
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1070
     The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
458
30c91ea7095b updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 436
diff changeset
  1071
     the third ``spills'' out nested alternatives (but retaining the
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1072
     bitsequence @{text "bs'"} accumulated in the inner alternative). There are
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1073
     some corner cases to be considered when the resulting list inside an alternative is
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1074
     empty or a singleton list. We take care of those cases in the
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1075
     @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1076
     sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1077
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1078
     \begin{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1079
     \begin{tabular}{c@ {\hspace{5mm}}c}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1080
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1081
     @{text "bsimpALTs bs []"}  & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1082
     @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1083
     @{text "bsimpALTs bs rs"}  & $\dn$ & @{text "ALTs bs rs"}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1084
     \mbox{}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1085
     \end{tabular}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1086
     &
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1087
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1088
     @{text "bsimpSEQ bs _ ZERO"}  & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1089
     @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1090
     @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1091
         & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1092
     @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ &  @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1093
     \end{tabular}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1094
     \end{tabular}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1095
     \end{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1096
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1097
     \noindent
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1098
     With this in place we can define our simplification function as
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1099
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1100
     \begin{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1101
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1102
     @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1103
         @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1104
     @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{thm (rhs) bsimp.simps(2)[of "bs" _]}\\
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1105
     @{text "bsimp r"} & $\dn$ & @{text r}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1106
     \end{tabular}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1107
     \end{center}
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1108
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1109
     \noindent
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1110
     As far as we can see, our recursive function @{term bsimp} simplifies regular
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1111
     expressions as intended by Sulzmann and Lu. There is no point in applying the
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1112
     @{text bsimp} function repeatedly (like the simplification in their paper which needs to be
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1113
     applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1114
     that is
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1115
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1116
     \begin{proposition}
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1117
     @{term "bsimp (bsimp r) = bsimp r"}
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1118
     \end{proposition}
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
  1119
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1120
     \noindent
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1121
     This can be proved by induction on @{text r} but requires a detailed analysis
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1122
     that the de-nesting of alternatives always results in a flat list of regular
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1123
     expressions. We omit the details since they do not concern the correctness proof.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1124
     
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1125
     Next we can include simplification after each derivative step leading to the
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1126
     following notion of bitcoded derivatives:
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1127
     
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1128
     \begin{center}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1129
      \begin{tabular}{cc}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1130
      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1131
      @{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1132
      \end{tabular}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1133
      &
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1134
      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1135
      @{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1136
      \end{tabular}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1137
      \end{tabular}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1138
      \end{center}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1139
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1140
      \noindent
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1141
      and use it in the improved lexing algorithm defined as
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1142
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1143
     \begin{center}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1144
\begin{tabular}{lcl}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1145
  $\textit{blexer}^+\;r\,s$ & $\dn$ &
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1146
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\                
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1147
  & & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1148
       \;\;\textit{else}\;\textit{None}$
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1149
\end{tabular}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1150
\end{center}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1151
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1152
       \noindent The remaining task is to show that @{term blexer} and
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1153
       @{term "blexer_simp"} generate the same answers.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1154
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1155
       When we first
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1156
       attempted this proof we encountered a problem with the idea
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1157
       in Sulzmann and Lu's paper where the argument seems to be to appeal
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1158
       again to the @{text retrieve}-function defined for the unsimplified version
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1159
       of the algorithm. But
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1160
       this does not work, because desirable properties such as
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1161
     %
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1162
     \[
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1163
     @{text "retrieve r v = retrieve (bsimp r) v"}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1164
     \]
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1165
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1166
     \noindent do not hold under simplification---this property
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1167
     essentially purports that we can retrieve the same value from a
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1168
     simplified version of the regular expression. To start with, @{text retrieve}
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
  1169
     depends on the fact that the value @{text v} corresponds to the
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1170
     structure of the regular expression @{text r}---but the whole point of simplification
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1171
     is to ``destroy'' this structure by making the regular expression simpler.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1172
     To see this, consider the regular expression @{text "r = r' + 0"} and a corresponding
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1173
     value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then 
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1174
     we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1175
     bitsequence. The reason that this works is that @{text r} is an alternative
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1176
     regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1177
     @{text r}, then @{text v} does not correspond to the shape of the regular 
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1178
     expression anymore. So unless one can somehow
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1179
     synchronise the change in the simplified regular expressions with
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1180
     the original POSIX value, there is no hope of appealing to @{text retrieve} in the
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1181
     correctness argument for @{term blexer_simp}.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1182
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1183
     We found it more helpful to introduce the rewriting systems shown in
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1184
     Figure~\ref{SimpRewrites}. The idea is to generate 
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1185
     simplified regular expressions in small steps (unlike the @{text bsimp}-function which
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1186
     does the same in a big step), and show that each of
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1187
     the small steps preserves the bitcodes that lead to the final POSIX value.
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
  1188
     The rewrite system is organised such that $\leadsto$ is for bitcoded regular
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1189
     expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1190
     expressions. The former essentially implements the simplifications of
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1191
     @{text "bsimpSEQ"} and @{text flts}; while the latter implements the
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1192
     simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
  1193
     regular expression reduces in zero or more steps to the simplified
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1194
     regular expression generated by @{text bsimp}:
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1195
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1196
     \begin{lemma}\label{lemone}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1197
     @{thm[mode=IfThen] rewrites_to_bsimp}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1198
     \end{lemma}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1199
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1200
     \begin{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1201
     By induction on @{text r}. For this we can use the properties
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1202
     @{thm fltsfrewrites} and @{thm ss6_stronger}. The latter uses
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1203
     repeated applications of the $LD$ rule which allows the removal
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1204
     of duplicates that can recognise the same strings. 
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1205
     \end{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1206
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1207
     \noindent
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1208
     We can show that this rewrite system preserves @{term bnullable}, that 
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1209
     is, simplification essentially does not affect nullability:
420
b66a4305749c updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 418
diff changeset
  1210
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1211
     \begin{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1212
     @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1213
     \end{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1214
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1215
     \begin{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1216
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1217
     The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1218
     be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1219
     @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1220
     \end{proof}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1221
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1222
     \noindent
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1223
     From this, we can show that @{text bmkeps} will produce the same bitsequence
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1224
     as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1225
     establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1226
     in the paper by Sulzmann and Lu). 
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1227
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1228
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1229
     \begin{lemma}\label{lemthree}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1230
     @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1231
     \end{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1232
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1233
     \begin{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1234
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1235
     Again the only interesting case is the rule $LD$ where we need to ensure that
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1236
     \[
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1237
     @{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1238
        bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}	
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1239
     \]
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1240
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1241
     \noindent holds. This is indeed the case because according to the POSIX rules the
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1242
     generated bitsequence is determined by the first alternative that can match the
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1243
     string (in this case being nullable).
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1244
     \end{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1245
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1246
     \noindent
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1247
     Crucial is also the fact that derivative steps and simplification steps can be interleaved,
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1248
     which is shown by the fact that $\leadsto$ is preserved under derivatives.
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1249
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1250
     \begin{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1251
     @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1252
     \end{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1253
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1254
     \begin{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1255
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1256
     The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1257
     whenever @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1258
     \end{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1259
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1260
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1261
     \noindent
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1262
     Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1263
     that the unsimplified
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1264
     derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1265
     
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1266
     \begin{lemma}\label{lemtwo}
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1267
     @{thm[mode=IfThen] central}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1268
     \end{lemma}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1269
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1270
     \begin{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1271
     By reverse induction on @{term s} generalising over @{text r}.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1272
     \end{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1273
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1274
     \noindent
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1275
     With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1276
     generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1277
     is indeed the POSIX value.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1278
     
418
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1279
     \begin{theorem}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1280
     @{thm[mode=IfThen] main_blexer_simp}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1281
     \end{theorem}
41a2a3b63853 more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 416
diff changeset
  1282
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1283
     \begin{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1284
     By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1285
     \end{proof}
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1286
     
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1287
     \noindent
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1288
     This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1289
     The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily, which
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1290
     we shall show next.
398
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1291
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1292
   \begin{figure}[t]
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1293
  \begin{center}
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1294
  \begin{tabular}{c}
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1295
  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\qquad
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1296
  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\\
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1297
  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1298
  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}$SL$\qquad
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1299
  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}$SR$\\
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1300
  @{thm[mode=Axiom] bs6}$A0$\qquad
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1301
  @{thm[mode=Axiom] bs7}$A1$\\
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1302
  @{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
  1303
  @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\qquad
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
  1304
  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\\
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1305
  @{thm[mode=Axiom] ss4}$L\ZERO$\qquad
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1306
  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1307
  @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\
398
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1308
  \end{tabular}
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1309
  \end{center}
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1310
  \caption{The rewrite rules that generate simplified regular expressions
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1311
  in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
461
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
  1312
  expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1313
  regular expressions. Interesting is the $LD$ rule that allows copies of regular
461
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
  1314
  expressions to be removed provided a regular expression earlier in the list can
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1315
  match the same strings.}\label{SimpRewrites}
398
dac6d27c99c6 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 397
diff changeset
  1316
  \end{figure}
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
  1317
*}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
  1318
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1319
section {* Finiteness of Derivatives *}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1320
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1321
text {*
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1322
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
  1323
In this section let us sketch our argument for why the size of the simplified
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1324
derivatives with the aggressive simplification function is finite. Suppose
436
222333d2bdc2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 426
diff changeset
  1325
we have a size function for bitcoded regular expressions, written
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1326
@{text "|r|"}, which counts the number of nodes if we regard $r$ as a tree
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1327
(we omit the precise definition). For this we show that for every $r$
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1328
there exists a bound $N$
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1329
such that 
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1330
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1331
\begin{center}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1332
$\forall s. \; |@{term "bders_simp r s"}| < N$
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1333
\end{center}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1334
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1335
\noindent
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1336
We prove this by induction on $r$. The base cases for @{term AZERO},
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1337
@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1338
for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1339
hypotheses state $\forall s. \; |@{term "bders_simp r\<^sub>1 s"}| < N_1$ and
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1340
$\forall s. \; |@{term "bders_simp r\<^sub>2 s"}| < N_2$. We can reason as follows
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1341
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1342
\begin{center}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1343
\begin{tabular}{lcll}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1344
& & $ |@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}|$\\
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1345
& $ = $ & $|bsimp(ALTs\;bs\;((@{term "bders_simp r\<^sub>1 s"}) \cdot r_2) ::
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1346
    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in Suf\!fix(s)])| $ & (1) \\
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1347
& $\leq$ &
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1348
    $|distinctBy\,(flts\,((@{term "bders_simp r\<^sub>1 s "}) \cdot r_2) ::
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1349
    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in Suf\!fix(s)])| + 1 $ & (2) \\
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1350
& $\leq$ & $|(@{term "bders_simp r\<^sub>1 s"}) \cdot r_2| +
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1351
             |distinctBy\,(flts\,   [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in Suf\!fix(s)])| + 1 $ & (3) \\
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1352
& $\leq$ & $N_1 + |r_2| + 2 + |distinctBy\,(flts\,   [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in Suf\!fix(s)])|$ & (4)\\
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1353
& $\leq$ & $N_1 + |r_2| + 2 + l_{N_{2}} * N_{2}$ & (5)
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1354
\end{tabular}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1355
\end{center}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1356
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1357
% tell Chengsong about Indian paper of closed forms of derivatives
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1358
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1359
\noindent
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1360
where in (1) $Suf\!fix(s)$ contains the suffixes $s'$ of $s$ where @{term "bders_simp r\<^sub>1 s''"} is nullable for
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1361
@{text "s = s'' @ s'"}. In (3) we know that  $|(@{term "bders_simp r\<^sub>1 s"}) \cdot r_2|$ is 
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1362
bounded by $N_1 + |r_2|$. In (5) we know the list comprehension contains only regular expressions of size smaller
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1363
than $N_2$. The list length after @{text distinctBy} is bounded by a number, which we call $l_{N_2}$. It stands
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1364
for the number of distinct regular expressions with a maximum size $N_2$ (there can only be finitely many of them).
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1365
We reason similarly in the @{text Star}-case.\medskip
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1366
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1367
\noindent
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1368
Clearly we give in this finiteness argument (Step (5)) a very loose bound that is
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1369
far from the actual bound we can expect. We can do better than this, but this does not improve
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1370
the finiteness property we are proving. If we are interested in a polynomial bound,
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1371
one would hope to obtain a similarly tight bound as for partial
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1372
derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1373
@{text distinctBy} is to maintain a ``set'' of alternatives (like the sets in
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1374
partial derivatives). Unfortunately to obtain the exact same bound would mean
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1375
we need to introduce simplifications such as
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1376
%
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1377
\[ (r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1378
\]
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1379
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1380
\noindent
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1381
which exist for partial derivatives. However, if we introduce them in our
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1382
setting we would lose the POSIX property of our calculated values. We leave better
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1383
bounds for future work.
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1384
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1385
*}
397
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
  1386
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
  1387
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
  1388
section {* Conclusion *}
e1b74d618f1b updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents: 396
diff changeset
  1389
396
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1390
text {*
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1391
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1392
   We set out in this work to prove in Isabelle/HOL the correctness of
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1393
   the second POSIX lexing algorithm by Sulzmann and Lu
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1394
   \cite{Sulzmann2014}. This follows earlier work where we established
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1395
   the correctness of the first algorithm
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1396
   \cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1397
   introduce our own specification about what POSIX values are,
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1398
   because the informal definition given by Sulzmann and Lu did not
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1399
   stand up to a formal proof. Also for the second algorithm we needed
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1400
   to introduce our own definitions and proof ideas in order to establish the
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1401
   correctness.  Our interest in the second algorithm 
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1402
   lies in the fact that by using bitcoded regular expressions and an aggressive
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
  1403
   simplification method there is a chance that the derivatives
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1404
   can be kept universally small  (we established in this paper that
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1405
   they can be kept finite for any string). This is important if one is after
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1406
   an efficient POSIX lexing algorithm based on derivatives.
425
14c558ae0b09 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 424
diff changeset
  1407
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1408
   Having proved the correctness of the POSIX lexing algorithm, which
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1409
   lessons have we learned? Well, we feel this is a very good example
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1410
   where formal proofs give further insight into the matter at
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1411
   hand. For example it is very hard to see a problem with @{text nub}
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1412
   vs @{text distinctBy} with only experimental data---one would still
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1413
   see the correct result but find that simplification does not simplify in well-chosen, but not
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1414
   obscure, examples. We found that from an implementation
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1415
   point-of-view it is really important to have the formal proofs of
462
d9b672c4c0ac updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 461
diff changeset
  1416
   the corresponding properties at hand.
d9b672c4c0ac updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 461
diff changeset
  1417
d9b672c4c0ac updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 461
diff changeset
  1418
   We have also developed a
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1419
   healthy suspicion when experimental data is used to back up
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1420
   efficiency claims. For example Sulzmann and Lu write about their
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1421
   equivalent of @{term blexer_simp} ``...we can incrementally compute
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1422
   bitcoded parse trees in linear time in the size of the input''
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1423
   \cite[Page 14]{Sulzmann2014}. 
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1424
   Given the growth of the
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1425
   derivatives in some cases even after aggressive simplification, this
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1426
   is a hard-to-believe claim. A similar claim about a theoretical runtime
459
484403cf0c9d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 458
diff changeset
  1427
   of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
  1428
   tokens according to POSIX rules~\cite{verbatim}. For this Verbatim uses Brzozowski's
462
d9b672c4c0ac updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 461
diff changeset
  1429
   derivatives, as in our work. 
474
726f4e65c0fe made paper changes after ITP comments
Christian Urban <christian.urban@kcl.ac.uk>
parents: 464
diff changeset
  1430
   The authors write: ``The results of our empirical tests [..] confirm that Verbatim has
461
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
  1431
   @{text "O(n\<^sup>2)"} time complexity.'' \cite[Section~VII]{verbatim}.
459
484403cf0c9d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 458
diff changeset
  1432
   While their correctness proof for Verbatim is formalised in Coq, the claim about
461
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
  1433
   the runtime complexity is only supported by some empirical evidence obtained
c4b6906068a9 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 460
diff changeset
  1434
   by using the code extraction facilities of Coq.
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1435
   Given our observation with the ``growth problem'' of derivatives,
460
6e269f557fc5 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 459
diff changeset
  1436
   we
6e269f557fc5 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 459
diff changeset
  1437
   tried out their extracted OCaml code with the example
6e269f557fc5 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 459
diff changeset
  1438
   \mbox{@{text "(a + aa)\<^sup>*"}} as a single lexing rule, and it took us around 5 minutes to tokenise a
459
484403cf0c9d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 458
diff changeset
  1439
   string of 40 $a$'s and that increased to approximately 19 minutes when the
464
e6248d2c20c2 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 463
diff changeset
  1440
   string is 50 $a$'s long. Taking into account that derivatives are not simplified in the Verbatim
460
6e269f557fc5 updated paper
Christian Urban <christian.urban@kcl.ac.uk>
parents: 459
diff changeset
  1441
   lexer, such numbers are not surprising. 
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1442
   Clearly our result of having finite
459
484403cf0c9d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 458
diff changeset
  1443
   derivatives might sound rather weak in this context, but we think such efficiency claims
484403cf0c9d updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 458
diff changeset
  1444
   really require further scrutiny.\medskip
426
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1445
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1446
   \noindent
5b77220fdf01 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 425
diff changeset
  1447
   Our Isabelle/HOL code is available under \url{https://github.com/urbanchr/posix}.
424
2416fdec6396 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 423
diff changeset
  1448
396
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1449
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1450
%%\bibliographystyle{plain}
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1451
\bibliography{root}
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1452
*}
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1453
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1454
(*<*)
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1455
end
cc8e231529fb added ITP paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
  1456
(*>*)