--- /dev/null
+(**************************************************************************)
+(* *)
+(* Menhir *)
+(* *)
+(* François Pottier, INRIA Rocquencourt *)
+(* Yann Régis-Gianas, PPS, Université Paris Diderot *)
+(* *)
+(* Copyright 2005-2008 Institut National de Recherche en Informatique *)
+(* et en Automatique. All rights reserved. This file is distributed *)
+(* under the terms of the GNU Library General Public License, with the *)
+(* special exception on linking described in file LICENSE. *)
+(* *)
+(**************************************************************************)
+
+(* This module compresses a two-dimensional table, where some values
+ are considered insignificant, via row displacement. *)
+
+(* This idea reportedly appears in Aho and Ullman's ``Principles
+ of Compiler Design'' (1977). It is evaluated in Tarjan and Yao's
+ ``Storing a Sparse Table'' (1979) and in Dencker, Dürre, and Heuft's
+ ``Optimization of Parser Tables for Portable Compilers'' (1984). *)
+
+(* A compressed table is represented as a pair of arrays. The
+ displacement array is an array of offsets into the data array. *)
+
+type 'a table =
+ int array * (* displacement *)
+ 'a array (* data *)
+
+(* In a natural version of this algorithm, displacements would be greater
+ than (or equal to) [-n]. However, in the particular setting of Menhir,
+ both arrays are intended to be compressed with [PackedIntArray], which
+ does not efficiently support negative numbers. For this reason, we are
+ careful not to produce negative displacements. *)
+
+(* In order to avoid producing negative displacements, we simply use the
+ least significant bit as the sign bit. This is implemented by [encode]
+ and [decode] below. *)
+
+(* One could also think, say, of adding [n] to every displacement, so as
+ to ensure that all displacements are nonnegative. This would work, but
+ would require [n] to be published, for use by the decoder. *)
+
+let encode (displacement : int) : int =
+ if displacement >= 0 then
+ displacement lsl 1
+ else
+ (-displacement) lsl 1 + 1
+
+let decode (displacement : int) : int =
+ if displacement land 1 = 0 then
+ displacement lsr 1
+ else
+ -(displacement lsr 1)
+
+(* It is reasonable to assume that, as matrices grow large, their
+ density becomes low, i.e., they have many insignificant entries.
+ As a result, it is important to work with a sparse data structure
+ for rows. We internally represent a row as a list of its
+ significant entries, where each entry is a pair of a [j] index and
+ an element. *)
+
+type 'a row =
+ (int * 'a) list
+
+(* [compress equal insignificant dummy m n t] turns the two-dimensional table
+ [t] into a compressed table. The parameter [equal] is equality of data
+ values. The parameter [wildcard] tells which data values are insignificant,
+ and can thus be overwritten with other values. The parameter [dummy] is
+ used to fill holes in the data array. [m] and [n] are the integer
+ dimensions of the table [t]. *)
+
+let compress
+ (equal : 'a -> 'a -> bool)
+ (insignificant : 'a -> bool)
+ (dummy : 'a)
+ (m : int) (n : int)
+ (t : 'a array array)
+ : 'a table =
+
+ (* Be defensive. *)
+
+ assert (Array.length t = m);
+ assert begin
+ for i = 0 to m - 1 do
+ assert (Array.length t.(i) = n)
+ done;
+ true
+ end;
+
+ (* This turns a row-as-array into a row-as-sparse-list. *)
+
+ let sparse (line : 'a array) : 'a row =
+
+ let rec loop (j : int) (row : 'a row) =
+ if j < 0 then
+ row
+ else
+ let x = line.(j) in
+ loop
+ (j - 1)
+ (if insignificant x then row else (j, x) :: row)
+ in
+
+ loop (n - 1) []
+
+ in
+
+ (* Define the rank of a row as its number of significant entries. *)
+
+ let rank (row : 'a row) : int =
+ List.length row
+ in
+
+ (* Construct a list of all rows, together with their index and rank. *)
+
+ let rows : (int * int * 'a row) list = (* index, rank, row *)
+ Array.to_list (
+ Array.mapi (fun i line ->
+ let row = sparse line in
+ i, rank row, row
+ ) t
+ )
+ in
+
+ (* Sort this list by decreasing rank. This does not have any impact
+ on correctness, but reportedly improves compression. The
+ intuitive idea is that rows with few significant elements are
+ easy to fit, so they should be inserted last, after the problem
+ has become quite constrained by fitting the heavier rows. This
+ heuristic is attributed to Ziegler. *)
+
+ let rows =
+ List.sort (fun (_, rank1, _) (_, rank2, _) ->
+ compare rank2 rank1
+ ) rows
+ in
+
+ (* Allocate a one-dimensional array of displacements. *)
+
+ let displacement : int array =
+ Array.make m 0
+ in
+
+ (* Allocate a one-dimensional, infinite array of values. Indices
+ into this array are written [k]. *)
+
+ let data : 'a InfiniteArray.t =
+ InfiniteArray.make dummy
+ in
+
+ (* Determine whether [row] fits at offset [k] within the current [data]
+ array, up to extension of this array. *)
+
+ (* Note that this check always succeeds when [k] equals the length of
+ the [data] array. Indeed, the loop is then skipped. This property
+ guarantees the termination of the recursive function [fit] below. *)
+
+ let fits k (row : 'a row) : bool =
+
+ let d = InfiniteArray.extent data in
+
+ let rec loop = function
+ | [] ->
+ true
+ | (j, x) :: row ->
+
+ (* [x] is a significant element. *)
+
+ (* By hypothesis, [k + j] is nonnegative. If it is greater than or
+ equal to the current length of the data array, stop -- the row
+ fits. *)
+
+ assert (k + j >= 0);
+
+ if k + j >= d then
+ true
+
+ (* We now know that [k + j] is within bounds of the data
+ array. Check whether it is compatible with the element [y] found
+ there. If it is, continue. If it isn't, stop -- the row does not
+ fit. *)
+
+ else
+ let y = InfiniteArray.get data (k + j) in
+ if insignificant y || equal x y then
+ loop row
+ else
+ false
+
+ in
+ loop row
+
+ in
+
+ (* Find the leftmost position where a row fits. *)
+
+ (* If the leftmost significant element in this row is at offset [j],
+ then we can hope to fit as far left as [-j] -- so this element
+ lands at offset [0] in the data array. *)
+
+ (* Note that displacements may be negative. This means that, for
+ insignificant elements, accesses to the data array could fail: they could
+ be out of bounds, either towards the left or towards the right. This is
+ not a problem, as long as [get] is invoked only at significant
+ elements. *)
+
+ let rec fit k row : int =
+ if fits k row then
+ k
+ else
+ fit (k + 1) row
+ in
+
+ let fit row =
+ match row with
+ | [] ->
+ 0 (* irrelevant *)
+ | (j, _) :: _ ->
+ fit (-j) row
+ in
+
+ (* Write [row] at (compatible) offset [k]. *)
+
+ let rec write k = function
+ | [] ->
+ ()
+ | (j, x) :: row ->
+ InfiniteArray.set data (k + j) x;
+ write k row
+ in
+
+ (* Iterate over the sorted list of rows. Fit and write each row at
+ the leftmost compatible offset. Update the displacement table. *)
+
+ let () =
+ List.iter (fun (i, _, row) ->
+ let k = fit row in (* if [row] has leading insignificant elements, then [k] can be negative *)
+ write k row;
+ displacement.(i) <- encode k
+ ) rows
+ in
+
+ (* Return the compressed tables. *)
+
+ displacement, InfiniteArray.domain data
+
+(* [get ct i j] returns the value found at indices [i] and [j] in the
+ compressed table [ct]. This function call is permitted only if the
+ value found at indices [i] and [j] in the original table is
+ significant -- otherwise, it could fail abruptly. *)
+
+(* Together, [compress] and [get] have the property that, if the value
+ found at indices [i] and [j] in an uncompressed table [t] is
+ significant, then [get (compress t) i j] is equal to that value. *)
+
+let get (displacement, data) i j =
+ assert (0 <= i && i < Array.length displacement);
+ let k = decode displacement.(i) in
+ assert (0 <= k + j && k + j < Array.length data);
+ (* failure of this assertion indicates an attempt to access an
+ insignificant element that happens to be mapped out of the bounds
+ of the [data] array. *)
+ data.(k + j)
+
+(* [getget] is a variant of [get] which only requires read access,
+ via accessors, to the two components of the table. *)
+
+let getget get_displacement get_data (displacement, data) i j =
+ let k = decode (get_displacement displacement i) in
+ get_data data (k + j)
+