/*
 * hash_str.c,v 2.1 1992/08/11 00:29:20 pete Exp
 * hash_str.c,v
 * Revision 2.1  1992/08/11  00:29:20  pete
 * Added proper RCS log and id entries.
 *
 */

#include	<stdio.h>

/*
 * From "Fast Hashing of Variable Length Text Strings" by Peter K. Pearson,
 * June 1990, Communications of the ACM, pp 677-680.  This implementation
 * uses the variation to achieve a 16 bit hash index instead of the 8 bit as
 * specified in the algorithm.
 *
 * The advantages of the algorithm are:
 *	- No restriction is placed on the length of the text string.
 *	- The length of the string need not be known before hand.
 *	- Very little arithmetic is performed on each character being
 *	  hashed.
 *	- Similar strings are not likely to collide
 *	- Minimal, perfect hashing functions can be built in this form.
 *
 * Applying the test case to /usr/lib/dict/words produced 6871 collisions
 * over 23739 entries vs. the more typical hash function (hash_simple) which
 * produced 14665 collions over 23739 entries.  On the other hand, smaller
 * lists of words (such as the passwd file with 136 entries), had similiar
 * number of collisions (~90 with a table size of 50).
 */

/*
 * The psuedo random permutation of integers from 0-255 (Table 1 in the
 * article.
 */

#define ARRAY_SIZE 256
static int table[ARRAY_SIZE] =
{
1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163,
14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200,
110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222,
25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20,214, 244, 227, 149, 235,
97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248,
174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243,
132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,
119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10,
138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152,
170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,
125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123,
118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,
27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203,
233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,
140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120,
51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209
};

static int
calc_hash (str, offset)
char		*str;			/* string to calculate hash value */
int		offset;			/* used for more bits */
{
	int	value;

	if (!str)
		return 0;
	/*
	 * Determine the initial value (i.e. use the offset).  This is no
	 * different if the offset == 0.
	 */
	value = table[*str++ + offset];

	/*
	 * Now calculate the hash value
	 */
	while (*str)
	{
		value = table[value ^ *str++];
	}
	return value;
}

int
hash_str (str)
char		*str;
{
	return (calc_hash (str, 0) << 8) | calc_hash (str, 1);
}


#ifdef TEST
static int
hash_simple (str)
char		*str;
{
	int	value = 0;
	int	i;

	if (!str)
		return 0;
	for (i = 0; *str; i++)
	{
		value ^= (*str++ << ((i & 3)*8));
	}
	return value;
}

/*
 * Let's use /usr/lib/dict/words to verify how good this hash function
 * really is.
 */

main (argc, argv)
int		argc;
char		**argv;
{
	FILE		*fptr;
	char		*file = "/usr/lib/dict/words";
	int		index;		/* index of hash values */
	int		count;		/* number of words */
	int		use_trivial = 0; /* use simple hash function flag */
	int		collisions;	/* how many collisions */
	int		*results;	/* where they are distributed */
	char		buf[BUFSIZ];	/* buffer to read word into */
	int		i;
	int		table_size = 1 << 15;
	int		opt;
	extern char	*optarg;

	while ((opt = getopt (argc, argv, "f:s:t")) != EOF)
	{
		switch (opt)
		{
		case 'f':
			file = optarg;
			break;
		case 's':
			table_size = atoi (optarg);
			break;
		case 't':
			use_trivial = 1;
			break;
		}
	}

	if ((fptr = fopen (file, "r")) == NULL)
	{
		perror (file);
		exit (1);
	}

	results = (int *) malloc (sizeof (int) * table_size);
	for (i = 0; i < table_size; i++)
	{
		results[i] = 0;
	}

	count = 0;
	collisions = 0;
	while (fgets (buf, sizeof (buf), fptr) != NULL)
	{
		if ((count % 500 ) == 0)
		{
			printf (".");
			fflush (stdout);
		}
		++count;
		if (use_trivial)
			index = hash_simple (buf) % table_size;
		else
			index = hash_str (buf) % table_size;
		if (results[index])
		{
			++collisions;
		}
		++results[index];
	}

	printf ("\n%d entries, %d collisions\n", count, collisions);
}

/*
 * A simple function to check that the above array was typed in (yuch)
 * correctly.  The array is checked to make sure are values are between
 * 0, 255; there are no duplicates; there are only 256 entries; all 256
 * values are used.  If there are any errors, the table is printed out (in
 * the same format as the one on page 678.
 */
check_table ()
{
	int	i;
	int	array[ARRAY_SIZE];
	int	err = 0;
	
	/* initialize the array */

	for (i = 0; i < ARRAY_SIZE; i++)
		array[i] = -1;

	for (i = 0; i < ARRAY_SIZE; i++)
	{
		if (table[i] < 0 || table[i] >= ARRAY_SIZE)
		{
			++err;
			printf ("table[%d] = %d (out of bounds)\n", i, table[i]);
			continue;
		}
		if (array[table[i]] != -1)
		{
			++err;
			printf ("table[%d] == table[%d] == %d\n", i, array[i],
				table[i]);
		}
		array[table[i]] = i;
	}

	/*
	 * Make sure everything is used
	 */
	if (err == 0)
	{
		for (i = 0; i < ARRAY_SIZE; i++)
			if (array[i] == -1)
			{
				printf ("%d not in table\n", i);
				++err;
			}
	}

	/*
	 * Print out the table if there are any errors
	 */

	if (err)
	{
		for (i = 0; i < ARRAY_SIZE; i++)
		{
			printf ("%3d ", table[i]);
			if ((i + 1) % 16 == 0)
				printf ("\n");
		}
	}
	fprintf (stderr, "%d errors\n", err);
	exit (err == 0?0:1);
}
#endif /* TEST */
