regexp.lang: Languages

Overview

Let A be a type representing the alphabet. Strings over A are captured by the type seq A of finite sequences over A and languages as predicates over seq A.

lang A: Languages over the alphabet set A. Languages support the following operations. They are predicates on seq A.

ε: the language consisting of only the empty sequence [::],
singleton a: For a : A the language consisting of the single letter string [::a],
satisfy pr: the language of single letter strings on which the predicate pr : A → bool evaluates to true.
L1 ≡ L2: The language L1 and L2 denote the same languages. For the automata theoretic setting it is this extensional notion of equivalence that is the right notion and not the intensional =. This relation is declared as a generalized equivalence relation and supports generalized rewriting.

Language operations

L1 || L2: the union of languages .
L1 ++ L2: the concatenation of languages
L^n or pow L n: the -fold concatenation of , i.e
many L: the Kleene closure
many1 L: the language

Co-functor structure on lang

comap: For f : A → B gives the contravariant lift comap f : lang B → lang A. Comap preserve all the operations on languages and supports generalized rewriting.
- It composes (contravariantly) comap f (comap g L) ≡ comap (g \o f) L.
- It preserves all the natural language theoretic operations that we defined above. For example, comap f (L1 ++ L2) ≡ comap f L1 ++ comap f L2

All the language operations and the comap function are declared as Morphism for the language equivalence relation ≡.

Definition lang A := seq A → Prop.

Definition epsilon {A} : lang A
  := fun xs ⇒ xs = [::].

Definition singleton {A} (a : A) : lang A
  := fun xs ⇒ xs = [:: a].

Definition satisfy {A} (tf : A → bool) : lang A
  := fun xs ⇒ ∃ a : A , xs = [:: a] ∧ tf a.

Definition alt{A}(L1 L2 : lang A) : lang A
  := fun xs ⇒ L1 xs ∨ L2 xs.

Definition concat{A}(L1 L2 : lang A) : lang A
  := fun xs ⇒ ∃ xs1 xs2 , (xs = xs1 ++ xs2 ∧ L1 xs1 ∧ L2 xs2 ).

Infix "||" := alt : language_scope.
Infix "++" := concat : language_scope.
Notation "'ε'" := epsilon.

Fixpoint pow {A}(L : lang A)(n : nat) : lang A :=
  if n is m.+1 then (L ++ pow L m)%lan
  else ε.

Infix "^" := pow : language_scope.

Definition many{A}(L : lang A) : lang A := fun xs ⇒ ∃ n , (L ^ n)%lan xs.
Definition many1{A}(L : lang A) : lang A := fun xs ⇒ ∃ n , (L ^ (n.+1))%lan xs.

Language equivalence.

The heart of most of our proofs involve proving two (potentially different) languages L1 and L2 are essentially the same. Since the languages might be defined differently (one coming from a regular expression the other from an automata) the most natural notion is the extensional equivalence of the underlying predicates.

Module eq.

  Section Cxt.
    Context {A : Type}.

    Definition rel (L1 L2 : lang A) := ∀ xs, L1 xs ↔ L2 xs.
    Lemma refl : reflexive (lang A) rel.
      done.
    Qed.

    Lemma sym : symmetric (lang A) rel.
      by move ⇒ L1 L2 h xs; rewrite h.
    Qed.

    Lemma trans : transitive (lang A) rel.
      by move ⇒ L1 L2 L3 h12 h23 xs; rewrite h12 h23.
    Qed.
  End Cxt.

End eq.

Add Parametric Relation A : (lang A) (@eq.rel A)
  reflexivity proved by eq.refl symmetry proved by eq.sym transitivity proved by eq.trans as lang_equivalence.

Add Parametric Morphism A : (@alt A) with signature
    (@eq.rel A ) ==> (@eq.rel A ) ==> (@eq.rel A ) as lang_alt_morphism.
  by move ⇒ L1 L2 H12 L3 L4 H34 xs; rewrite /alt (H12 xs) (H34 xs).
Qed.

Infix "≡" := eq.rel (at level 70).

Lemma satisfy_ext [A][f g : A → bool] : f =1 g → (satisfy f ≡ satisfy g)%lan.
Proof.
  move ⇒ Hfg xs; rewrite /satisfy; split; case ⇒ a [hxs hyp]; ∃ a.
  - by rewrite -Hfg.
  - by rewrite Hfg.
Qed.

Add Parametric Morphism A : (@concat A) with signature
    (@eq.rel A ) ==> (@eq.rel A ) ==> (@eq.rel A ) as lang_concat_morphism.
  move ⇒ L1 L2 H12 L3 L4 H34 xs; rewrite /concat; split; case ⇒ ys [zs [hxs]].
  - by rewrite (H12 ys) (H34 zs); case ⇒ h1 h2; ∃ ys; ∃ zs.
  - by rewrite -(H12 ys) -(H34 zs); case ⇒ h1 h2; ∃ ys; ∃ zs.
Qed.

Lemma concat_epsilon {A} (L : lang A) : (L ++ ε)%lan ≡ L.
  rewrite /concat; move ⇒ xs; split.
  - by case ⇒ ys [zs [hxs [hys hzs]]]; rewrite hxs hzs cats0.
  - by move ⇒ h; ∃ xs; ∃ [::]; rewrite cats0 //= /epsilon.
Qed.

Lemma epsilon_concat {A} (L : lang A) : (ε ++ L)%lan ≡ L.
  rewrite /concat; move ⇒ xs; rewrite /epsilon; split.
  - by case ⇒ ys [zs [hxs [hys hzs]]]; rewrite hxs hys cat0s.
  - by move ⇒ h; ∃ [::]; ∃ xs; rewrite cat0s //= /epsilon.
Qed.

Add Parametric Morphism A : (@pow A) with signature
    (@eq.rel A ) ==> eq ==> (@eq.rel A ) as lang_pow_morphism.
  move ⇒ L1 L2 H n.
  elim: n ⇒ [|n IHn]; by do 2 rewrite //= ?H ?IHn.
Qed.

Add Parametric Morphism A : (@many A) with signature
    (@eq.rel A ) ==> (@eq.rel A ) as lang_many_morphism.
  move ⇒ L1 L2 H.
  have hpow : ∀ n, (L1 ^ n ≡ L2 ^ n)%lan by move ⇒ n; rewrite H.
  rewrite /many ⇒ xs; split; case ⇒ n.
  - by rewrite (hpow n xs); ∃ n.
  - by rewrite -(hpow n xs); ∃ n.
Qed.

Add Parametric Morphism A : (@many1 A) with signature
    (@eq.rel A ) ==> (@eq.rel A ) as lang_many1_morphism.
  move ⇒ L1 L2 H.
  have hpow : ∀ n, (L1 ^ n ≡ L2 ^ n)%lan by move ⇒ n; rewrite H.
  rewrite /many ⇒ xs; split; case ⇒ n.
  - by rewrite (hpow (n.+1) xs); ∃ n.
  - by rewrite -(hpow (n.+1) xs); ∃ n.
Qed.

From Stdlib Require List.

Lemma pow0 {A}{L : lang A} {xs} : (L ^ 0)%lan xs → xs = [::].
  rewrite //.
Qed.

Lemma pow1 {A}(L : lang A) : (L ^ 1 ≡ L)%lan.
  by rewrite /pow concat_epsilon.
Qed.

Lemma many1_nil {A}(L : lang A) : many1 L [::] → L [::].
  case ⇒ [n]; rewrite //=; case ⇒ ys [zs [hnil [hys hzs]]].
  have hyp : ys ++ zs = [::] by rewrite -hnil.
  set hnils := List.app_eq_nil ys zs hyp.
  case: hnils ⇒ h1 h2; by rewrite -h1.
Qed.

Contravariant lift of alphabet transformations.

Function composition (give by the operator \o) allows us to lift functions f : A → B on the underlying alphabets types A and B to the functions comap f: lang B → lang A between their language family.

Definition comap {A B : Type}(f : A → B) (L : lang B) : lang A :=
L \o map f.

Section Comap.
  Context {A B}(f : A → B).

  Lemma comap_epsilon : comap f ε ≡ ε.
    move ⇒ xs; split.
    - rewrite /comap /comp.
      by move /List.map_eq_nil.
    - by move ⇒ Heps; rewrite Heps //=.
  Qed.

  Lemma comap_alt (L1 L2 : lang B) : (comap f (L1 || L2) ≡ comap f L1 || comap f L2)%lan.
    move ⇒ xs; rewrite /comap //=.
  Qed.

  Lemma comap_satisfy {pr : B → bool} : (comap f (satisfy pr) ≡ satisfy (pr \o f))%lan.
    move ⇒ xs; rewrite /satisfy /comap /comp; split.
    - case ⇒ b [hxs hpr].
      by case (map_single hxs) ⇒ a [h1 hf]; ∃ a; rewrite h1 -hf.
    - case ⇒ a [hxs hpr]; ∃ (f a); by rewrite hxs.
  Qed.

  Lemma comap_concat (L1 L2 : lang B) : (comap f (L1 ++ L2) ≡ comap f L1 ++ comap f L2)%lan.
    move ⇒ xs; rewrite /comap //= /concat /comp; split.
    - case ⇒ ys1 [ys2 [Hconcat [H1 H2]]].
      case (map_catE Hconcat) ⇒ xs1 [xs2 [Hxsconcat [Hm1 Hm2]]].
      ∃ xs1; ∃ xs2; by rewrite //= Hm1 Hm2.
    - case ⇒ xs1 [xs2 [Hxs [HL1 HL2]]].
      ∃ [seq f x | x <- xs1]; ∃ [seq f x | x <- xs2]; by rewrite Hxs map_cat.
  Qed.

  Lemma comap_pow [L : lang B] : ∀ n, (comap f (L ^n) ≡ (comap f L )^n)%lan .
    elim ⇒ [|n IH].
    - by rewrite comap_epsilon.
    - by rewrite comap_concat IH.
  Qed.

  Lemma comap_many1 (L : lang B) : (comap f (many1 L) ≡ many1 (comap f L))%lan.
    move ⇒ xs; split.
    - set lhs := many1 _ _; rewrite /comap; case ⇒ n Hyp.
      by ∃ n; rewrite -(comap_pow _ xs) /comap /comp.
    - case ⇒ n; rewrite -(comap_pow _ xs) /comap. by ∃ n.
  Qed.
End Comap.

Lemma comap_comap {A B C}(f : B → A)(g : C → B) : ∀ L : lang A, comap (f \o g) L ≡ comap g (comap f L).
  move ⇒ L xs; by rewrite /comap ?/comp map_comp.
Qed.

Add Parametric Morphism A B : comap with signature
    (@eqfun A (fun _ ⇒ B)) ==> (@eq.rel B ) ==> (@eq.rel A ) as lang_comap_morphism.
  move ⇒ f g hfg L1 L2 H xs; by rewrite /comap /comp (H _) (eq_map hfg) .
Qed.