Wiki Agenda Contact Version française

Edition distance

Correctness of a program computing the minimal distance between two words.


Authors: Claude Marché / Jean-Christophe Filliâtre

Topics: Array Data Structure / Inductive predicates

Tools: Why3

see also the index (by topic, by tool, by reference, by year)


(* Correctness of a program computing the minimal distance between
   two words (code by Claude Marché).

   This program computes a variant of the Levenshtein distance. Given
   two strings [w1] and [w2] of respective lengths [n1] and [n2], it
   computes the minimal numbers of insertions and deletions to
   perform in one of the strings to get the other one.  (The
   traditional edit distance also includes substitutions.)

   The nice point about this algorithm, due to Claude Marché, is to
   work in linear space, in an array of min(n1,n2) integers. Time
   complexity is O(n1 * n2), as usual.
*)

theory Word

  use export int.Int
  use export int.MinMax
  use export list.List
  use export list.Length

  type char
  type word = list char

  inductive dist word word int =
  | dist_eps :
      dist Nil Nil 0
  | dist_add_left :
      forall w1 w2: word, n: int.
      dist w1 w2 n -> forall a: char. dist (Cons a w1) w2 (n + 1)
  | dist_add_right :
      forall w1 w2: word, n: int.
      dist w1 w2 n -> forall a: char. dist w1 (Cons a w2) (n + 1)
  | dist_context :
      forall w1 w2:word, n: int.
      dist w1 w2 n -> forall a: char. dist (Cons a w1) (Cons a w2) n

  predicate min_dist (w1 w2: word) (n: int) =
    dist w1 w2 n /\ forall m: int. dist w1 w2 m -> n <= m

  (* intermediate lemmas *)

  use export list.Append

  function last_char (a: char) (u: word) : char = match u with
    | Nil -> a
    | Cons c u' -> last_char c u'
  end

  function but_last (a: char) (u: word) : word = match u with
    | Nil -> Nil
    | Cons c u' -> Cons a (but_last c u')
  end

  lemma first_last_explicit:
    forall u: word, a: char.
    but_last a u ++ Cons (last_char a u) Nil = Cons a u

  lemma first_last:
     forall a: char, u: word. exists v: word, b: char.
     v ++ Cons b Nil = Cons a u /\ length v = length u

  lemma key_lemma_right:
    forall w1 w'2: word, m: int, a: char.
    dist w1 w'2 m ->
    forall w2: word. w'2 = Cons a w2 ->
    exists u1 v1: word, k: int.
    w1 = u1 ++ v1 /\ dist v1 w2 k /\ k + length u1 <= m + 1

  lemma dist_symetry:
    forall w1 w2: word, n: int. dist w1 w2 n -> dist w2 w1 n

  lemma key_lemma_left:
    forall w1 w2: word, m: int, a: char.
    dist (Cons a w1) w2 m ->
    exists u2 v2: word, k: int.
    w2 = u2 ++ v2 /\ dist w1 v2 k /\ k + length u2 <= m + 1

  lemma dist_concat_left:
    forall u v w: word, n: int.
    dist v w n -> dist (u ++ v) w (length u + n)

  lemma dist_concat_right:
    forall u v w: word, n: int.
    dist v w n -> dist v (u ++ w) (length u + n)

  (* main lemmas *)

  lemma min_dist_equal:
    forall w1 w2: word, a: char, n: int.
     min_dist w1 w2 n -> min_dist (Cons a w1) (Cons a w2) n

  lemma min_dist_diff:
    forall w1 w2: word, a b: char, m p: int.
    a <> b ->
    min_dist (Cons a w1) w2 p ->
    min_dist w1 (Cons b w2) m ->
    min_dist (Cons a w1) (Cons b w2) (min m p + 1)

  lemma min_dist_eps:
    forall w: word, a: char, n: int.
    min_dist w Nil n -> min_dist (Cons a w) Nil (n + 1)

  lemma min_dist_eps_length:
    forall w: word. min_dist Nil w (length w)

end

module EditDistance

  use import Word
  use import list.Length as L
  use import ref.Ref
  use import array.Array

  (* Auxiliary definitions for the program and its specification. *)

  function suffix (a: array char) (i: int) : word

  axiom suffix_nil:
    forall a: array char. suffix a a.length = Nil

  axiom suffix_cons:
    forall a: array char, i: int.
    0 <= i < a.length -> suffix a i = Cons a[i] (suffix a (i+1))

  lemma suffix_length:
    forall a: array char, i: int.
    0 <= i <= a.length -> L.length (suffix a i) = (a.length - i)

  predicate min_suffix (a1 a2: array char) (i j n: int) =
    min_dist (suffix a1 i) (suffix a2 j) n

  function word_of (a: array char) : word = suffix a 0

  (* The program. *)

  let distance (w1: array char) (w2: array char)
    ensures  { min_dist (word_of w1) (word_of w2) result }
  = let n1 = length w1 in
    let n2 = length w2 in
    let t = Array.make (n2+1) 0 in
    (* initialization of t *)
    for i = 0 to n2 do
      invariant { forall j:int. 0 <= j < i -> t[j] = n2 - j }
      t[i] <- n2 - i
    done;
    (* loop over w1 *)
    for i = n1 - 1 downto 0 do
      invariant { forall j:int. 0 <= j <= n2 -> min_suffix w1 w2 (i+1) j t[j] }
      let oldt = ref t[n2] in
      t[n2] <- t[n2] + 1;
      (* loop over w2 *)
      for j = n2 - 1 downto 0 do
        invariant { forall k:int. j < k <= n2 -> min_suffix w1 w2 i k t[k] }
        invariant { forall k:int. 0 <= k <= j -> min_suffix w1 w2 (i+1) k t[k] }
        invariant { min_suffix w1 w2 (i+1) (j+1) !oldt }
        let temp = !oldt in
        oldt := t[j];
        if w1[i] = w2[j] then
          t[j] <- temp
        else
          t[j] <- (min t[j] t[j + 1]) + 1
      done
    done;
    t[0]

end

download ZIP archive