static char rcsid[] = "recognize.c,v 1.34 1996/01/05 20:28:58 duane Exp";
/*
 *  recognize.c - Type recognition for the Essence system.
 *
 *  DEBUG: section  63, level 1         Gatherer essence type recognition
 *
 *  Darren Hardy, hardy@cs.colorado.edu, February 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>
#include <sys/types.h>
#include <sys/param.h>
#include <ctype.h>
#include "url.h"
#include "util.h"
#include "essence.h"

/* Local data structures */
/*
 *  struct type_regex - One recognition rule: a type name paired with the
 *  regular expression that selects it.
 */
struct type_regex {		/* maps a type name into a regex pattern */
	char *type;		/* name copied into object->type on a match */
	char *pattern;		/* regular expression source text */
#if defined(USE_GNU_REGEX) || defined(USE_POSIX_REGEX)
	regex_t compiled_pattern;	/* can reuse compiled patterns */
#endif
};
/*
 *  Rule tables, one per recognition method.  init_type_recognize()
 *  allocates each array with room for MAX_TYPES entries and
 *  finish_type_recognize() releases them; each ntypes_* counter holds the
 *  number of entries that init_types() has filled in the matching array.
 */
static struct type_regex *types_by_name = NULL;
static struct type_regex *types_by_content = NULL;
static struct type_regex *types_by_url = NULL;
static int ntypes_by_name = 0;
static int ntypes_by_content = 0;
static int ntypes_by_url = 0;

/* Local Functions */
/* Loads one rule file into one of the tables above */
static int init_types();

/*
 *  Local macros
 *
 *  type_match(s, tr) - Non-zero when the string s matches the rule tr
 *  (a struct type_regex).  The expansion depends on the regex library
 *  selected at build time:
 *
 *    USE_GNU_REGEX    re_match() on the precompiled pattern, starting at
 *                     offset 0; true only when re_match() returns a value
 *                     greater than zero.  s is expanded twice here.
 *    USE_POSIX_REGEX  regexec() on the precompiled pattern.
 *    USE_BSD_REGEX    re_comp() on tr.pattern at every call, followed by
 *                     re_exec() on s.
 *
 *  NOTE(review): no definition is provided when none of the three USE_*
 *  symbols is defined, although init_types() has an #else fallback that
 *  uses re_comp() -- confirm the build always defines one of them.
 */
#if defined(USE_GNU_REGEX)
#define type_match(s, tr) \
	(re_match(&(tr).compiled_pattern, (s), strlen(s), 0, 0) > 0)

#elif defined(USE_POSIX_REGEX)
#define type_match(s, tr) \
	(regexec(&(tr).compiled_pattern, (s), 0, 0, 0) == 0)

#elif defined(USE_BSD_REGEX)
#define type_match(s, tr) \
	((re_comp((tr).pattern) == NULL) ? (re_exec(s) > 0) : 0)
#endif

/*
 *  type_recognize() - Determines the type of the given DataObject and
 *  stores a strdup()'d type name in object->type.
 *
 *  File URLs are tried by stat, by URL, by name and then by content;
 *  HTTP, FTP, Gopher, NOP, X and News URLs skip the stat step.  Content
 *  recognition is only attempted when F_NO_ACCESS is not set in
 *  object->flags.  When nothing matches (or the URL type is unsupported,
 *  which is logged), external typing is tried and finally the type
 *  "Unknown" is assigned.  Any data buffered in object->data is released
 *  before returning.
 *
 *  Always returns 0.
 */
int type_recognize(object)
     DataObject *object;
{
	int recognized = 0;

	Debug(63, 1, ("type_recognize(%s)\n", object->url->url));
	switch (object->url->type) {
	case URL_FILE:
		/* Local files get the stat(2) based check first */
		if (type_recognize_by_stat(object) == 0) {
			recognized = 1;
			break;
		}
		/* FALLTHROUGH: the remaining steps are shared */
	case URL_HTTP:
	case URL_FTP:
	case URL_GOPHER:
	case URL_NOP:
	case URL_X:
	case URL_NEWS:
		if (type_recognize_by_url(object) == 0) {
			recognized = 1;
		} else if (type_recognize_by_name(object) == 0) {
			recognized = 1;
		} else if ((object->flags & F_NO_ACCESS) == 0 &&
		    type_recognize_by_content(object) == 0) {
			recognized = 1;
		}
		break;
	default:
		errorlog("Unsupported URL: recognize: %s.\n", object->url->url);
		break;
	}

	/* Try external typing */
	if (!recognized && type_recognize_by_external(object) == 0)
		recognized = 1;

	/* Default type */
	if (!recognized)
		object->type = strdup("Unknown");

	/* Clean up: drop whatever content was buffered for recognition */
	if (object->data != NULL) {
		xfree(object->data);
		object->data = NULL;
		object->dsize = 0;
	}
	return 0;
}

/*
 *  type_recognize_by_name() - Assigns a type to the DataObject based on
 *  its basename alone.  The by-name rules are tried in table order and
 *  the first one whose pattern matches object->basename supplies the
 *  type, which is strdup()'d into object->type.
 *
 *  Returns 0 when a rule matched; 1 when none did.
 */
int type_recognize_by_name(object)
     DataObject *object;
{
	int idx = 0;

	while (idx < ntypes_by_name) {
		if (type_match(object->basename, types_by_name[idx])) {
			object->type = strdup(types_by_name[idx].type);
			return 0;
		}
		idx++;
	}
	return 1;
}

/*
 *  type_recognize_by_url() - Assigns a type to the DataObject based on
 *  its full URL string.  The by-url rules are tried in table order and
 *  the first one whose pattern matches object->url->url supplies the
 *  type, which is strdup()'d into object->type.
 *
 *  Returns 0 when a rule matched; 1 when none did.
 */
int type_recognize_by_url(object)
     DataObject *object;
{
	int n;
	int status = 1;		/* stays 1 until a rule matches */

	for (n = 0; status != 0 && n < ntypes_by_url; n++) {
		if (type_match(object->url->url, types_by_url[n])) {
			object->type = strdup(types_by_url[n].type);
			status = 0;
		}
	}
	return status;
}

/*
 *  type_recognize_by_content() - Assigns a type to the DataObject by
 *  inspecting its leading bytes with the file(1) routines softmagic()
 *  and ascmagic(), then matching their description against the
 *  by-content rules.
 *
 *  If object->data is not yet loaded, the start of the object is read:
 *  the whole object when it is smaller than MIN_XFER * 4 bytes,
 *  otherwise MIN_XFER bytes.  A zero-length object is typed "Empty"
 *  without reading anything.
 *
 *  Returns 0 when object->type was set.  Returns 1 when the object could
 *  not be retrieved or read, when the file(1) routines produced no
 *  description, or when no rule matched the description.
 */
int type_recognize_by_content(object)
     DataObject *object;
{
	extern char *softmagic(), *ascmagic();
	char *desc;
	char *rawtype;
	int idx, nbytes;
	int status = 1;

	if (object_retrieve(object))
		return 1;

	/* First read the first bytes of the file, if not already there */
	if (object->data == NULL) {
		if (object->s->st_size == 0) {
			object->dsize = 0;
			object->data = NULL;
			object->type = strdup("Empty");
			return 0;
		}
		/* Small objects are read whole, large ones only partly */
		if (object->s->st_size < (MIN_XFER * 4))
			nbytes = object->s->st_size;
		else
			nbytes = MIN_XFER;

		/* Need some extra room because file() isn't nice */
		object->data = xmalloc(nbytes + 32);
		memset(object->data, '\0', nbytes + 32);
		object->dsize = url_read(object->data, nbytes, 0, object->url);
		Debug(63, 1, ("Reading %d bytes (got %d) from %s\n", nbytes,
			object->dsize, object->url->url));
		if (object->dsize <= 0) {
			xfree(object->data);
			object->data = NULL;
			object->dsize = 0;
			return 1;
		}
	}

	/* Now use the routines from file(1) to identify contents */
	desc = softmagic(object->data);
	if (desc == NULL)
		desc = ascmagic(object->data, object->dsize);
	if (desc == NULL)
		return 1;	/* still unknown */
	rawtype = strdup(desc);

	/* Match the output of file(1) with our regular expressions */
	for (idx = 0; idx < ntypes_by_content; idx++) {
		if (type_match(rawtype, types_by_content[idx])) {
			object->type = strdup(types_by_content[idx].type);
			status = 0;
			break;
		}
	}
	xfree(rawtype);
	return status;		/* 1 means still unknown */
}

/*
 *  type_recognize_by_stat() - Assigns a type to the DataObject from the
 *  file mode in its stat(2) data (object->s->st_mode).  Directories,
 *  symbolic links and sockets (the latter two only where the platform
 *  provides the test macro) get their own type; any other non-regular
 *  file is typed "Unknown".
 *
 *  Returns 0 when object->type was set; 1 for a regular file, which is
 *  left for the other recognition steps.
 */
int type_recognize_by_stat(object)
     DataObject *object;
{
	const char *label = NULL;

	if (S_ISDIR(object->s->st_mode))
		label = "Directory";
#ifdef S_ISLNK
	if (label == NULL && S_ISLNK(object->s->st_mode))
		label = "SymbolicLink";
#endif
#ifdef S_ISSOCK
	if (label == NULL && S_ISSOCK(object->s->st_mode))
		label = "Socket";
#endif
	/* bizarre file */
	if (label == NULL && !S_ISREG(object->s->st_mode))
		label = "Unknown";

	if (label == NULL)
		return 1;
	object->type = strdup(label);
	return 0;
}

/*
 *  type_recognize_by_external() - Placeholder for recognizing the type
 *  of a DataObject with external, user-defined programs.  The contract
 *  is 0 on success and non-zero otherwise; this stub performs no
 *  recognition and always returns 1.
 */
int type_recognize_by_external(object)
     DataObject *object;
{
	(void) object;		/* not examined by the stub */
	return 1;
}

/* Initialization routines */

/*
 *  init_type_recognize() - Initialize type recognition step.
 *
 *  Loads magic_file through apprentice() for the file(1) routines, then
 *  allocates the three rule tables and fills them from cfile_by_name,
 *  cfile_by_url and cfile_by_content (in that order) with init_types().
 *  When debugging for section 63, level 1 is enabled, every loaded rule
 *  is logged.
 *
 *  Returns 0 on success.  Returns 1 when the magic file or any of the
 *  rule files cannot be loaded; in the latter case the tables are torn
 *  down again so that no partially initialized state is left behind.
 */
int init_type_recognize(cfile_by_name, cfile_by_content, cfile_by_url, magic_file)
     char *cfile_by_name;
     char *cfile_by_content;
     char *cfile_by_url;
     char *magic_file;
{
	int apprentice(), init_types();
	void finish_type_recognize();

	/* Read in magic file for file(1) routines */
	Debug(63, 1, ("Using %s as magic file.\n", magic_file));
	if (apprentice(magic_file, 0)) {
		log_errno(magic_file);
		return (1);
	}
	/* Read in by-name and by-content regular expressions */
	Debug(63, 1, ("Using %s as by-name configuration file.\n", cfile_by_name));
	Debug(63, 1, ("Using %s as by-content configuration file.\n", cfile_by_content));
	Debug(63, 1, ("Using %s as by-url configuration file.\n", cfile_by_url));

	/*
	 *  Release anything left from an earlier initialization.  init_types()
	 *  appends at the current ntypes_* index, so stale counters would make
	 *  it skip over uninitialized slots of the freshly allocated arrays.
	 *  This is a no-op while the tables are NULL and the counters are 0.
	 */
	finish_type_recognize();

	types_by_name = xmalloc(MAX_TYPES * sizeof(struct type_regex));
	types_by_content = xmalloc(MAX_TYPES * sizeof(struct type_regex));
	types_by_url = xmalloc(MAX_TYPES * sizeof(struct type_regex));
	if (init_types(cfile_by_name, types_by_name, &ntypes_by_name) ||
	    init_types(cfile_by_url, types_by_url, &ntypes_by_url) ||
	    init_types(cfile_by_content, types_by_content, &ntypes_by_content)) {
		/*
		 *  Freeing only the arrays would leak the rules already
		 *  loaded and leave the globals pointing at freed memory
		 *  with non-zero counters.  finish_type_recognize() frees
		 *  the entries and the arrays and resets pointers and
		 *  counters.
		 */
		finish_type_recognize();
		return (1);
	}
	if (debug_ok(63, 1)) {
		int i;
		for (i = 0; i < ntypes_by_content; i++)
			Log("Type: %s\tRE: %s\n", types_by_content[i].type,
			    types_by_content[i].pattern);
		for (i = 0; i < ntypes_by_name; i++)
			Log("Type: %s\tRE: %s\n", types_by_name[i].type,
			    types_by_name[i].pattern);
		for (i = 0; i < ntypes_by_url; i++)
			Log("Type: %s\tRE: %s\n", types_by_url[i].type,
			    types_by_url[i].pattern);
	}
	return (0);
}


/*
 *  finish_type_recognize() - Cleans up after the type recognition step.
 *  For each of the by-name, by-content and by-url tables (in that order)
 *  the type and pattern strings of every loaded rule are freed, the
 *  compiled pattern is released with regfree() when POSIX regex is in
 *  use, the table itself is freed, and the table pointer and its counter
 *  are reset to NULL and 0.
 */

/* Releases the compiled form of one rule, where the library needs it */
#if defined(USE_POSIX_REGEX)
#define FREE_COMPILED_RULE(tr)	regfree(&(tr).compiled_pattern)
#else
#define FREE_COMPILED_RULE(tr)	((void) 0)
#endif

/* Frees all `count' rules of `table', then the table; resets both */
#define FREE_TYPE_TABLE(table, count) \
	do { \
		for (k = 0; k < (count); k++) { \
			if ((table)[k].type != NULL) \
				xfree((table)[k].type); \
			if ((table)[k].pattern != NULL) \
				xfree((table)[k].pattern); \
			FREE_COMPILED_RULE((table)[k]); \
		} \
		if ((table) != NULL) \
			xfree(table); \
		(table) = NULL; \
		(count) = 0; \
	} while (0)

void finish_type_recognize()
{
	int k;			/* loop index used by FREE_TYPE_TABLE */

	FREE_TYPE_TABLE(types_by_name, ntypes_by_name);
	FREE_TYPE_TABLE(types_by_content, ntypes_by_content);
	FREE_TYPE_TABLE(types_by_url, ntypes_by_url);
}

#undef FREE_TYPE_TABLE
#undef FREE_COMPILED_RULE

/*
 *  init_types() - Initializes the given type_regex array with the regular
 *  expressions from filename.  Returns 0 on success; non-zero otherwise.
 *
 *  Each rule line holds a type name, whitespace, and then the pattern,
 *  which extends to the end of the line.  Lines that begin with '#' and
 *  empty lines are skipped.  A line that has no whitespace after the
 *  type name (and therefore no pattern) is reported and ignored.
 *
 *  Rules are appended at index *nt of t and *nt is advanced for each one;
 *  t must have room for MAX_TYPES entries.  Once the table is full, a
 *  warning is logged and the remaining rules are ignored.  A pattern
 *  that does not compile is reported through fatal().
 *
 *  The only failure reported by return value is an unreadable file.
 */
static int init_types(filename, t, nt)
     char *filename;
     struct type_regex *t;
     int *nt;
{
	FILE *fp;
	char buf[BUFSIZ], *s;
	struct type_regex *entry;
	int ret;

	if ((fp = fopen(filename, "r")) == NULL) {
		log_errno(filename);
		return (1);
	}
	while (fgets(buf, BUFSIZ, fp) != NULL) {
		if (buf[0] == '#' || buf[0] == '\n')
			continue;
		if ((s = strrchr(buf, '\n')) != NULL)
			*s = '\0';

		/* Check for room before storing, so t is never overrun */
		if (*nt >= MAX_TYPES) {
			errorlog("WARNING!: %s has too many types.\n", filename);
			break;
		}

		/*
		 *  Find the end of the type name.  The scan must also stop at
		 *  the terminating NUL: isspace('\0') is false, so a line
		 *  without any whitespace would otherwise run off the buffer.
		 *  The cast keeps isspace() defined for bytes with the high
		 *  bit set where char is signed.
		 */
		for (s = &buf[0]; *s != '\0' && !isspace((unsigned char) *s); s++);
		if (*s == '\0') {
			errorlog("WARNING!: %s: ignoring line without a pattern: %s\n",
			    filename, buf);
			continue;
		}
		entry = &t[*nt];
		*s = '\0';
		entry->type = strdup(buf);

		/* The pattern is everything after the separating whitespace */
		for (++s; isspace((unsigned char) *s); s++);
		entry->pattern = strdup(s);

#if defined(USE_GNU_REGEX) || defined(USE_POSIX_REGEX)
		/*
		 *  Start from a cleared pattern buffer.  GNU
		 *  re_compile_pattern() reads the buffer, allocated, fastmap
		 *  and translate fields on entry, and the table storage is
		 *  not known to be zeroed by xmalloc().
		 */
		memset(&entry->compiled_pattern, '\0',
		    sizeof(entry->compiled_pattern));
#endif
#if defined(USE_GNU_REGEX)
		re_syntax_options = USE_RE_SYNTAX;
		ret = !(re_compile_pattern(entry->pattern,
			(int) strlen(entry->pattern),
			&entry->compiled_pattern) == NULL);
#elif defined(USE_POSIX_REGEX)
		ret = regcomp(&entry->compiled_pattern, entry->pattern,
		    USE_RE_SYNTAX);
#else
		ret = !(re_comp(entry->pattern) == NULL);	/* test it */
#endif
		if (ret != 0) {
			fatal("Couldn't compile %s", entry->pattern);
		}
		++(*nt);
	}
	fclose(fp);
	return (0);
}
