/* RCS revision identification string for this file; nothing in the code below references it. */
static char rcsid[] = "mkwordlist.c,v 1.18 1996/01/05 20:28:55 duane Exp";
/*
 *  mkwordlist - Generates a list of unique words from the file
 *
 *  Darren Hardy, hardy@cs.colorado.edu, September 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "util.h"
#include "essence.h"

/*
 *  mkwordlist() - Generates a list of unique words from the input buffer s
 *  of length sz bytes.
 *
 *  A copy of the buffer is made in which alphabetic characters are
 *  lower-cased and every other byte is replaced by a newline.  The copy
 *  is piped through "sort | uniq" into a temporary file, which is then
 *  read back.  Only lines whose length, counting the trailing newline,
 *  is between 4 and 25 bytes are kept (words of 3 to 24 letters).
 *
 *  Returns the words as a single NUL-terminated string with each word
 *  followed by a newline.  The string is allocated with xmalloc(); this
 *  function never frees it on success, so the caller owns it.
 *
 *  Returns NULL if s is NULL, if sz < 3, if the input looks like binary
 *  data (more than two non-ASCII bytes), or on any error.
 */
char *mkwordlist(s, sz)
     char *s;
     int sz;
{
	char *result = NULL;
	char buf[BUFSIZ], *tmp, *tmpfile;
	int i, n, notascii = 0;
	int filled, in_long_line = 0, wrote_ok;
	size_t wordsz, used = 0, cap;
	struct stat sb;
	FILE *fp;

	if (s == NULL || sz < 3)
		return NULL;
	/*
	 *  Abort if the input buffer is binary data.  The counter is never
	 *  reset, so the input is rejected as soon as it contains a third
	 *  non-ASCII byte anywhere, not only three in a row.
	 *  NOTE(review): earlier comments said "in a row"; the code has
	 *  always counted the total, and that behavior is kept here.
	 */
	for (i = 0; i < sz; i++) {
		if (!isascii((unsigned char) s[i]))
			notascii++;
		if (notascii > 2)
			return (NULL);
	}

	/*
	 *  Grab a temporary filename.
	 *  NOTE(review): tempnam() only yields a name, and the file is later
	 *  created by the shell redirection, so the name is predictable and
	 *  is passed to the shell unquoted.  Consider mkstemp() if the temp
	 *  directory can be influenced by untrusted users.
	 */
	if ((tmpfile = tempnam(NULL, "wdlst")) == NULL) {
		log_errno("tempnam");
		return NULL;
	}
	/* Build the shell command; refuse a path that does not fit in buf */
	n = snprintf(buf, sizeof(buf), "sort | uniq > %s", tmpfile);
	if (n < 0 || (size_t) n >= sizeof(buf)) {
		errno = ENAMETOOLONG;
		log_errno(tmpfile);
		xfree(tmpfile);
		return NULL;
	}
	/* 
	 *  Make a copy of the input buffer;
	 *  convert upper case to lower case,
	 *  and convert punctuation, numbers, etc. to \n 
	 */
	tmp = xmalloc(sz + 1);
	memcpy(tmp, s, sz);
	tmp[sz] = '\0';
	for (i = 0; i < sz; i++) {
		if (isalpha((unsigned char) tmp[i])) {
			if (isupper((unsigned char) tmp[i]))
				tmp[i] = tolower((unsigned char) tmp[i]);
		} else {
			tmp[i] = '\n';
		}
	}

	/* Remove the tmpfile (if exists) and sort/uniq the word list */
	if (access(tmpfile, F_OK) == 0 && unlink(tmpfile) < 0) {
		log_errno(tmpfile);
		goto cleanup;
	}
	if ((fp = popen(buf, "w")) == NULL) {
		log_errno(buf);
		goto cleanup;
	}
	wrote_ok = (fwrite(tmp, 1, sz, fp) == (size_t) sz)
	    && (fputc('\n', fp) != EOF);
	if (!wrote_ok)
		log_errno(buf);	/* report before pclose() can change errno */
	/*
	 *  NOTE(review): the exit status of the pipeline is deliberately not
	 *  examined (as before); a failed sort shows up as an empty or
	 *  missing temporary file below.
	 */
	(void) pclose(fp);
	if (!wrote_ok)
		goto cleanup;

	/* Now read in the tmpfile to get the results */
	if (stat(tmpfile, &sb) < 0) {
		log_errno(tmpfile);
		goto cleanup;
	}
	cap = (size_t) sb.st_size;
	result = xmalloc(sb.st_size + 1);
	if ((fp = fopen(tmpfile, "r")) == NULL) {
		log_errno(tmpfile);
		xfree(result);
		result = NULL;
		goto cleanup;
	}
	/*
	 *  Filter the output: drop words of fewer than 3 letters and lines
	 *  longer than 25 bytes including the newline.
	 */
	while (fgets(buf, sizeof(buf), fp) != NULL) {
		wordsz = strlen(buf);
		/*
		 *  A chunk that fills buf without reaching a newline is part
		 *  of an over-long line.  Skip it and every following chunk
		 *  of the same line, so the tail of a huge "word" is never
		 *  mistaken for a short word of its own.
		 */
		filled = (wordsz == sizeof(buf) - 1 && buf[wordsz - 1] != '\n');
		if (in_long_line || filled) {
			in_long_line = filled;
			continue;
		}
		if (wordsz <= 3)	/* word of length 2 or less, plus \n */
			continue;
		if (wordsz > 25)	/* more than 24 letters plus \n */
			continue;
		/* Never write past the size measured by stat() above */
		if (wordsz > cap - used)
			break;
		memcpy(&result[used], buf, wordsz);
		used += wordsz;
	}
	result[used] = '\0';
	fclose(fp);

      cleanup:
	/* result is still NULL here on every error path */
	(void) unlink(tmpfile);
	xfree(tmpfile);
	xfree(tmp);
	return result;
}
