/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* ./glimpse/index/filetype.c */
/* --------------------------------------------------------------------------
   This function detects whether a given file is of a special type
   that we do not want to index.
   If so, it returns 1; otherwise it returns 0.
   A file is said to be binary if more than 10% of the characters in the
   sampled input are > 128.
   A file is considered uuencoded if (possibly after a mail header) there is
   a "begin" followed by 3 digits, and no lower-case characters.

   Statistics we are concerned with:
   1) average word length: should not be greater than 10.
   2) index density: the number of different words vs. the total number of words.

-----------------------------------------------------------------------------*/
#include "glimpse.h"
/* Maximum number of bytes filetype() reads from the start of a file; all
   content tests in this file work on that sample only. */
#define SAMPLE_SIZE  8192
#define WORD_THRESHOLD  18  /* the ratio between the number of characters and
		the number of delimiters (blanks, \n or '/', as counted by hqx())
		above which the file is determined to be
		hqx or other non-natural language text */

/* Debug log stream, defined elsewhere; only referenced when BG_DEBUG is set. */
#if	BG_DEBUG
extern	FILE	*LOGFILE;
#endif	/*BG_DEBUG*/
/* NOTE(review): member[] is not referenced in the visible part of this file. */
char *member[MAX_4K_HASH];
/* member_tag[h] holds the file_id most recently stamped into hash slot h by
   heavy_index(); a slot whose tag differs from file_id counts as a new word. */
int member_tag[MAX_4K_HASH];
/* Tag value heavy_index() compares against and stores into member_tag[];
   presumably set by the caller once per file -- TODO confirm. */
int  file_id;
/* Word extractor defined elsewhere; used by heavy_index() to walk the sample. */
extern  char *getword();
/* Defined elsewhere; not referenced in the visible part of this file. */
extern char INDEX_DIR[MAX_LINE_LEN];

/*
 * Decide whether the file `name' should be kept out of the index.
 *
 * dosuffix selects which checks are run:
 *   dosuffix == 0 => content checks only: the file is opened and sampled
 *                    (Default, dir.c where we want to discard un-indexable files);
 *   dosuffix == 1 => suffix checks first, then the content checks
 *                    (build_in.c after filtering);
 *   otherwise     => suffix checks only, the file is never opened
 *                    (IndexEverything, dir.c where we don't want to read files).
 *
 * Suffix checks: a name ending in COMP_SUFFIX is always accepted (return 0);
 * a name matched by test_special_suffix() is rejected (return 1).
 *
 * Content checks, applied in this order to the first SAMPLE_SIZE bytes:
 * postscript, binary, uuencoded, heavy index, hqx / too few separators.
 * The first test that fires rejects the file; later tests are not run.
 *
 * Returns 1 if the file should NOT be indexed (this includes files that
 * cannot be opened or yield no data), 0 if it may be indexed.
 */
int
filetype(name, dosuffix)
char *name;
int dosuffix;
{
	unsigned char buffer[SAMPLE_SIZE+1];
	int num_read;
	int fd;
	int skip = 0;	/* set to 1 by the first content test that rejects the file */

	if (dosuffix) {
		int name_len = (int) strlen(name);
		int suffix_len = (int) strlen(COMP_SUFFIX);

		/*
		 * Only look at the tail of name when name is at least as long as
		 * the suffix: otherwise the offset would be negative and strcmp
		 * would read memory in front of the string.
		 */
		if (name_len >= suffix_len &&
		    !strcmp(COMP_SUFFIX, &name[name_len - suffix_len]))
			return 0;
		if (test_special_suffix(name)) {
#if	BG_DEBUG
			fprintf(LOGFILE, "special suffix: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
			return 1;
		}
		/* suffix-only mode: never touch the file contents */
		if (dosuffix != 1) return 0;
	}

	/* flags == 0, i.e. open read-only */
	if((fd = open(name, 0)) < 0) {
		/* This is the only thing the user might want to know: suppress other warnings */
		fprintf(stderr, "permission denied or non-existent file: %s\n", name);
		return(1);
	}
	/* a read error and an empty file are both treated as "no data" */
	if ((num_read = read(fd, buffer, SAMPLE_SIZE)) <= 0) {
#if	BG_DEBUG
		fprintf(LOGFILE, "no data: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		close(fd);
		return 1;
	}

	/*
	 * The tests form an else-if chain so that each one runs only when all
	 * earlier ones accepted the sample (heavy_index() updates member_tag[],
	 * so it must not run for files already rejected).
	 */
	if (test_postscript(buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "postscript file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		skip = 1;
	}
	else if (test_binary(buffer, num_read) == ON) {
#if	BG_DEBUG
		fprintf(LOGFILE, "binary file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		skip = 1;
	}
	else if (test_uuencode(buffer, num_read) == ON) {
#if	BG_DEBUG
		fprintf(LOGFILE, "uuencoded file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		skip = 1;
	}
	else if (heavy_index(name, buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "heavy index file: %s -- not indexing\n ", name);
#endif	/*BG_DEBUG*/
		skip = 1;
	}
	else if (hqx(name, buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "too few real words: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		skip = 1;
	}

	/* single exit for every path that got as far as sampling the file */
	close(fd);
	return(skip);
}

/* ----------------------------------------------------------------------
   Check for a heavy index file, i.e. a sample in which nearly every word
   is a word not seen before for this file.

   The words in buffer[0 .. num_read) are extracted one at a time with
   getword() and hashed with hash4k().  A word counts as "new" when its
   slot in member_tag[] does not yet carry the current file_id; the slot
   is then stamped with file_id.  Different words that hash to the same
   slot are therefore counted as new only once.

   The loop stops as soon as getword() returns a position at or past the
   end of the sample, so the word produced by that final call is not
   counted.

   name is only passed through to getword() (and to the debug printf).

   Returns 1 if the sample holds at least 500 words and at least 83% of
   them are new; returns 0 otherwise.

   Side effect: updates member_tag[].
---------------------------------------------------------------------- */
int
heavy_index(name, buffer, num_read)
char *name;
char *buffer;
int num_read;
{
	char *buffer_end;
	int hash_value;
	int new_word_num=0;	/* words whose hash slot was not yet tagged with file_id */
	int word_num=0;		/* all non-empty words seen in the sample */
	char word[256];

	buffer_end = &buffer[num_read];
	while((buffer = getword(name, word, buffer, buffer_end, NULL)) < buffer_end) {
		/* getword may hand back an empty word; it is not counted */
		if(word[0] == '\0') continue;
		word_num++;
		hash_value = hash4k(word, strlen(word));
		if(member_tag[hash_value] != file_id) {
			new_word_num++;
			member_tag[hash_value] = file_id;
		}
	}
	/* integer form of: new_word_num / word_num >= 0.83, with enough words to judge */
	if(new_word_num * 100 >= word_num * 83 && word_num >= 500) return(1);
#ifdef debug
	printf("%s: new_word_num=%d, word_num=%d\n", name, new_word_num, word_num);
#endif
	return(0);
}

/* ----------------------------------------------------------------------
   Check for hqx encoded files or other files with long lines,
   for example, postscript files, core files, and others.

   Only the sample in buffer[0 .. num_read) is examined.  The sample is
   judged bad when separators (blank, newline or '/') are too rare, i.e.
   when num_read / separators (integer division) exceeds WORD_THRESHOLD,
   or when there is no separator at all.

   Samples shorter than 2048 bytes are always accepted.

   name is not used; it is kept so the signature matches the other tests.

   Returns 1 if the sample looks like non-text, 0 otherwise.
---------------------------------------------------------------------- */
int
hqx(name, buffer, num_read)
char *name;
char *buffer;
int num_read;
{
	int pos;
	int separators = 0;

	/* too little data to draw a conclusion */
	if (num_read < 2048)
		return(0);

	for (pos = 0; pos < num_read; pos++) {
		switch (buffer[pos]) {
		case '\n':	/* for lists of words, but should be excluded really
				   so that dictionaries are excluded */
		case ' ':
		case '/':	/* the '/' is for list of file names. */
			separators++;
			break;
		default:
			break;
		}
	}

	/* no separator at all: certainly not natural-language text
	   (also avoids dividing by zero below) */
	if (separators == 0)
		return(1);

	return (num_read / separators > WORD_THRESHOLD) ? 1 : 0;
}
