(*************************************************************************)
(*                                                                       *)
(*                     Projet      Formel                                *)
(*                                                                       *)
(*                            CAML                                       *)
(*                                                                       *)
(*************************************************************************)
(*                                                                       *)
(*                            Inria                                      *)
(*                      Domaine de Voluceau                              *)
(*                      78150  Rocquencourt                              *)
(*                            France                                     *)
(*                                                                       *)
(*************************************************************************)

(* hash.ml	Some applications of the universal hashing function.     *)
(*		Xavier Leroy.						 *)

(*\
\chapter{Hashing}

\section{The universal hashing function}

\begin{caml_primitive}
hash_univ
hash_univ_param
\end{caml_primitive}

These functions associate with every CAML value a positive number
(a small integer, indeed), trying to map them as evenly as possible.
The average behaviour is quite good, but \verb|hash_univ| is only supposed
to verify the two following properties:

\begin{itemize}
\item   two equal values are guaranteed to have the same hashing values,
        i.e. \verb|x = y| implies \verb|hash_univ x = hash_univ y|

\item   \verb|hash_univ| always terminates, even on circular values
        (though the hashing may be somewhat poor on such data)
\end{itemize}

Hashing is achieved by recursive descent on the representation
of the value, and combining hashing values for the leaves of this
 representation, i.e. strings, integers, floating point numbers,
 and atoms, as follows:

\begin{caml_example}
#fast arith true;;
let rec hash_obj obj =
  let cpt_meaningful = ref 10 and cpt_depth = ref 1000
  and hash_address x =
    match address_of_obj x with
        Addr_short n -> (num_of_int n)
      | Addr_long(n,n') -> (num_of_int n')
  in hash_rec 0 obj
  where rec hash_rec accu obj =
    if !cpt_depth<0 or !cpt_meaningful<0 then accu else
      match obj with
          <:obj< () >> ->
                accu
        | obj_int i ->
                decr cpt_meaningful; accu * 263 + Int i
        | obj_float f ->
                decr cpt_meaningful; accu * 263 + Float f
        | obj_string s ->
                decr cpt_meaningful;
                (hash_string accu (pred(length_string s))
                 where rec hash_string accu i =
                  if i<0 then accu else
                   hash_string
                    (accu * 263 + nth_ascii(i,s)) (pred i))
        | obj_cons(obj1,obj2) ->
                decr cpt_depth;
                hash_rec (hash_rec accu obj1) obj2
        | obj_vect V ->
                decr cpt_depth;
                (hash_vector accu (pred (vect_length V))
                 where rec hash_vector accu i =
                  if i<0 then accu else
                   hash_vector
                    (hash_rec accu (vect_item(V,i))) (pred i))
        | x ->  decr cpt_meaningful;
                accu * 263 + hash_address x
;;
let hash_univ = hash_obj o Repr
;;
\end{caml_example}
\begin{caml_eval}
#fast arith false;;
\end{caml_eval}
The actual function \verb|hash_univ| is written in machine language,
for the sake of efficiency.


\verb"hash_univ_param" is similar to \verb"hash_univ", but it uses two extra
\verb"num" arguments to tailor \verb"hash_univ" to a particular use. Namely:
\verb"hash_univ_param(n1,n2,O)" finds a hash code for the object \verb"O",
exploring \verb"O" until at least \verb"n2" significant leaves have been
seen, under the constraint that at most \verb"n1" recursive calls to the
hashing function can be made.

\verb"hash_univ O" is equivalent to \verb"hash_univ_param (1000,20,O)".
In case of structures sharing a big tree from their root, \verb"hash_univ"
may have bad results, finding the same hash code for a lot of different
objects. An example is hash coding all the sub-lists of a list which has
many instances of the same element
 (or even worse, only one repeated element). In this case it may help to tune
the hashing function using \verb"hash_univ_param" instead of \verb"hash_univ".

\section{Using dynamic hashing instead of lists}

A dynamic hashing table is a vector of lists, called {\em buckets}, holding a collection of elements, with the convention that all the elements of the bucket number $n$ hash to the value $n$ (modulo the length of the vector). 
Hence, adding, removing an item, and searching for a given item may be restricted to the bucket whose number is the hash key of the item.
Given a hashing table of size $m$, supposing a perfectly regular hashing, this allows storing $n$ elements with maximum retrieval time proportional to $n/m$, instead of $n$ in case of a linear list.

The choice of a size for the hash table is left to the user; if the amount of data is not too large, an $n/m$ ratio of 1 to 5 is sensible.
Also, it is well known that hashing usually works better when the table size is a prime number.

\begin{caml_primitive}
suggested_hash_table_size
\end{caml_primitive}

The following function is intended to help you choose the right size: given an estimated size, it computes a prime number close to it and (for a non-negative integer estimate) at least as large.
\*)

(* suggested_hash_table_size n: suggest a prime table size for about n
   elements.  The search starts at 6*integer(n/6)+5 and walks upward through
   the candidates of the form 6k+5 and 6k+7, returning the first one that is
   prime.  For a non-negative integer n the result is never smaller than n. *)
let suggested_hash_table_size n = 
  (* Trial division by 2 and 3, then by 5, 7, 11, 13, ... (the numbers of
     the form 6k-1 and 6k+1) up to the integer part of sqrt n. *)
  let is_prime n =
    let dmax = integer(sqrt n) in
      let rec is_prime_rec d =
        (* The bound is inclusive: d = dmax must still be tried, otherwise
           numbers such as 35 = 5*7 or 121 = 11*11 would be reported prime. *)
        d>dmax or ((n mod d <> 0) & (n mod (d+2) <> 0) & (is_prime_rec (d+6)))
      in (n mod 2 <> 0) & (n mod 3 <> 0) &  (is_prime_rec 5) in
  (* m is always of the form 6k+5, so m and m+2 are the only candidates in
     each group of six consecutive numbers not divisible by 2 or 3. *)
  let rec hash_size_rec m =
    if is_prime m then m
    if is_prime (m+2) then m+2
    else hash_size_rec (m+6)
  in hash_size_rec (6*integer(n/6)+5)
;;

(*\
Once the size has been chosen, an empty hash table should be created by the following primitive:

\begin{caml}

let hash_init n = vector n of [];;

\end{caml}

but the current typechecker cannot handle this function, because \verb|[]| is polymorphic and polymorphic vectors are always prohibited.
Hence the user has to allocate the table ``by hand'', constraining the type of the empty list if necessary:

\begin{caml}

let my_hash_table = vector 1013 of ([] : (string & num) list);;

\end{caml}

\begin{caml_primitive}
hash_clear
hash_copy
\end{caml_primitive}

The function \verb|hash_clear| empties (physically) the given hash table;
\verb|hash_copy| copies the content of the first given hash table into the second one, overwriting the previous content of the second one.
\*)

(* hash_clear v: physically empty the table v by replacing every bucket
   with the empty list. *)
let hash_clear v =
  let empty_bucket _ = [] in
    modify_vect (empty_bucket,v)

(* hash_copy v w (w being the second argument): store bucket number i of v
   at index i of the second table, for every index i of v.  Buckets are
   copied as they are, without rehashing, so this is only meaningful when
   both tables have the same length. *)
and hash_copy v v' =
  let store_bucket i b = vect_assign(v',i,b) in
    do_vect_i store_bucket v
;;

(* 1- Hashed sets   *)

(*\

\section{Hashed sets}

We give here some primitives to deal with sets represented by hashed tables.
Most operations --- especially, membership --- are much faster than with the usual list representation. But adding and removing elements is done via side effects on the table, making these primitives unsuitable for some applications.

\begin{caml_primitive}
hash_add_elem
hash_remove_elem
hash_mem
\end{caml_primitive}

The functions \verb|hash_add_elem| and \verb|hash_remove_elem| respectively add one element to the given hashed set (unless it is already there, i.e. we implement true sets), and remove one element.
They work by side effects on the table, as mentioned above. The function \verb|hash_mem| is the exact counterpart of \verb|mem|, i.e. it tests membership.
\*)
(* hash_add_elem e v: add e to the hashed set v, unless its bucket already
   contains it, and return e.  The bucket index is the hash code of e
   modulo the length of v. *)
let hash_add_elem e v =
  let i = (hash_univ e) mod (vect_length v) in
  let bucket = vect_item(v,i) in
    if not (mem e bucket) then vect_assign(v,i,e::bucket);
    e

(* hash_remove_elem e v: remove e from its bucket in v; returns (). *)
and hash_remove_elem e v =
  let i = (hash_univ e) mod (vect_length v) in
  let bucket = vect_item(v,i) in
    vect_assign(v,i,except e bucket); ()

(* hash_mem e v: membership test.  Only the bucket of e is searched;
   buckets of at most two elements are compared directly, longer ones are
   handed to mem. *)
and hash_mem e v =
  let i = (hash_univ e) mod (vect_length v) in
    match vect_item(v,i) with
        []      -> false
      | [x]     -> e=x
      | [x;y]   -> e=x or e=y
      | bucket  -> mem e bucket
;;
(*\

\par\noindent
\begin{bf}
Beware:
\end{bf}
\par\noindent
\verb"--" Physical modification of values stored in hashed sets is not advisable, since the hash key for a modified element will change also, but the element will stay in the same bucket, hence retrieval of that element will usually fail. For instance:

\begin{caml_example}
let v = vector 10 of ([] : string list) and s = "foo";;
hash_add_elem s v;;
hash_mem s v;;
replace_string s "a" 1;;
hash_mem s v;;
\end{caml_example}

\begin{caml_primitive}
hash_union
hash_intersect
hash_subtract
\end{caml_primitive}

These are the usual operations between two sets, where the result is physically put into the second one. These primitives operate bucket by bucket, hence they don't work on tables of different sizes.
\*)

(* hash_union, hash_intersect, hash_subtract: bucket-wise set operations on
   two hashed sets of the same length.  For every index i, bucket i of the
   second table is replaced by the list operation applied to bucket i of
   the first table and bucket i of the second table, in that order.
   Fails with a message when the two tables have different lengths. *)
let hash_union, hash_intersect, hash_subtract =
  let hash_oper oper v v' =
    if vect_length v <> vect_length v'
    then failwith "set operation : different lengths"
    else
      let combine i b' = vect_assign(v',i,oper (vect_item(v,i)) b') in
        do_vect_i combine v'
  in hash_oper union, hash_oper intersect, hash_oper subtract
;;

(*\
\begin{caml_primitive}
hash_merge
hash_resize
\end{caml_primitive}

The function \verb|hash_merge| also performs the union of two hashed sets, but in a different way, so that it also handles tables of different sizes.
This gives a way to copy the content of a hashed set into another one of different size, as shown by \verb|hash_resize|.
However, these functions are slower than \verb|hash_copy| and \verb|hash_union|, because all the elements have to be hashed again.
\*)

(* hash_merge v w (w being the second argument): add every element of every
   bucket of v to the second table through hash_add_elem, so each element
   is hashed again against the length of the second table.  The first
   table is only read. *)
let hash_merge v v' =
  let add_to_target e = hash_add_elem e v' in
    do_vect (fun bucket -> map add_to_target bucket) v
;;

(* hash_resize v w (w being the second argument): empty the second table,
   then merge the content of v into it.  Because the second table is
   cleared first, passing the same table for both arguments leaves it
   empty. *)
let hash_resize v v' =
  hash_clear v';
  hash_merge v v'
;;


(* 2- Hashed graphs (hashed association lists) *)

(*\

\section{Hashed association lists}

Hashing allows efficient implementation of association lists, i.e. lists of pairs associating a value to a key.

\begin{caml_primitive}
hash_add_point
hash_remove_point
hash_assoc
hash_mem_assoc
\end{caml_primitive}

Adding a pair of a key and a value to the hashed A-list, i.e. binding a key to a value, is achieved through \verb|hash_add_point|; removing of such a binding, given its key, is done by \verb|hash_remove_point|.
The function \verb|hash_assoc| retrieves the value associated with the given key; \verb|hash_mem_assoc| indicates whether the given key is bound or not.

Binding a key to different values, i.e. adding several pairs with the same key and different values, is allowed, though these primitives always use the most recent binding, i.e. \verb|hash_assoc| will always return the value associated last to the key, and \verb|hash_remove_point| will always throw away the last binding of a key, therefore ``popping'' the previous one.

\*)

(* hash_add_point (key,val) v: put the pair in front of the bucket of key
   and return the pair.  Earlier bindings of the same key are kept behind
   the new one. *)
let hash_add_point (key,val as pair) v =
  let i = (hash_univ key) mod (vect_length v) in
  let bucket = vect_item(v,i) in
    vect_assign(v,i,pair::bucket);
    pair

(* hash_remove_point key v: replace the bucket of key by the result of
   except_assoc on it; returns (). *)
and hash_remove_point key v =
  let i = (hash_univ key) mod (vect_length v) in
  let bucket = vect_item(v,i) in
    vect_assign(v,i,except_assoc key bucket); ()

(* hash_mem_assoc key v: tell whether key is bound in v.  Buckets of at
   most two pairs are tested directly, longer ones go through mem_assoc. *)
and hash_mem_assoc key v =
  let i = (hash_univ key) mod (vect_length v) in
    match vect_item(v,i) with
        []            -> false
      | [k1,_]        -> key=k1
      | [k1,_;k2,_]   -> key=k1 or key=k2
      | bucket        -> mem_assoc key bucket

(* hash_assoc key v: return the value bound to key in its bucket, the
   front-most binding being tried first.  The short-bucket cases fail with
   the message find when key is unbound; longer buckets are delegated to
   assoc. *)
and hash_assoc key v =
  let i = (hash_univ key) mod (vect_length v) in
    match vect_item(v,i) with
        []            -> failwith "find"
      | [k1,x1]       -> if key=k1 then x1 else failwith "find"
      | [k1,x1;k2,x2] -> if key=k1 then x1
                         if key=k2 then x2
                         else failwith "find"
      | bucket        -> assoc key bucket

;;

(* 3- Sharing *)

(*\

\section{Using hashing to build shared structures}

When using very large data structures in which the same data occur several times, i.e. where many parts of the structure are equal, trying to share these data, i.e. having but one physical representation of the data in memory and many pointers to it, might prove dramatically space-saving.

Such a sharing is conveniently achieved by keeping a list of all values we have used so far, and after building a new value, searching the list for a previous value which is equal to the new value, and later always using the ``old'' value thus found; that way, the value we built won't be used anymore and will be garbage collected soon.

Such a search is usually time-consuming; efficient hashing allows keeping this time affordable. Don't forget that cluttered memory means frequent time-wasting garbage collection, hence sharing may, in the end, speed up some programs.

\begin{caml_primitive}
hash_repr
hash_find_repr
\end{caml_primitive}

Given an (initially empty) table and a value, the function \verb|hash_repr| returns the {\em representative} for it, i.e. it gives a value equal to the starting value (\verb|hash_repr v table = v| is always true), and when called several times with the same value, it returns the same physical object (i.e. \verb|v = v'| implies that \verb|eq(hash_repr v table, hash_repr v' table)| is true).

Using \verb|hash_repr| is straightforward: after initializing a hash table, each time you create, say, a string, and you are about to put it into some relatively permanent data structure (one with a long lifespan, such as a symbol table, for instance), call \verb|hash_repr| on it and use the result instead.

The function \verb|hash_find_repr| is similar, but fails when the given value is not already in the table. It is sometimes useful, as the following example shows.
\*)
(* hash_find_repr e v: return the element already stored in v that is equal
   to e, i.e. the stored object itself rather than e.  The bucket of e is
   scanned from its head; fails with the message find when no stored
   element is equal to e. *)
let hash_find_repr e v = 
  let rec find_repr =
    function  []       -> failwith "find"
         |  e'::rest   -> if e=e' then e' else find_repr rest
  in find_repr (vect_item(v,(hash_univ e) mod (vect_length v)))
;;

(* hash_repr e v: return the representative of e in the table v.  First try
   hash_find_repr, which yields the stored element equal to e; if that
   fails, fall back (through the ? operator) to hash_add_elem, which
   records e in the table and returns it. *)
let hash_repr e v =
  hash_find_repr e v  ?  hash_add_elem e v
;;

(*\

\subsection*{Advanced example: sharing substructures}

Suppose you are using a term-like data structure, such as, for instance:

\begin{caml_example}
type term = Leaf of string  |  Node of string & term list;;
\end{caml_example}

Maximal sharing, i.e. sharing of common sub-terms, may then be achieved by the following \verb|share_term| function:

\begin{caml_example}
let rec share_term =
 let sharing_table = vector 263 of [] in
  function Leaf s as term -> hash_repr term sharing_table
        |  Node(s,L) as term ->
           try
           (* The whole term is already in the table *)
           hash_find_repr term sharing_table
           with failure "find" ->
           (* Otherwise: we find recursively representatives
              for the elements of L and we build a new Node
              with these sons; this new Node is the one we
              put in the hash table and return *)
           hash_add_elem (Node(s,map share_term L)) sharing_table
;;
\end{caml_example}

This is a tried-and-true technique: in the CAML system itself,
 the types of global values are represented by a term-like structure;
 sharing them as shown led to a space saving of no less than 120 Kbytes \ldots
\*)

