/*
 * scanbody.c
 *
 * Code to perform the scanning of the body of news articles, searching for
 * precompiled regular expressions, and collecting facts about the article,
 * such as number of lines in the body, number of quoted lines, size of
 * the signature, and so on.
 *
 * The main body of the article is read into a chunk of temporary memory,
 * in a completely raw, unformatted manner.  This chunk of text is then
 * massaged according to the settings of some global flags, and data
 * structures indicating lines, paragraphs, and specific areas of the
 * article are constructed.
 */

 /*
  * Newsclip(TM) Library Source Code.
  * Copyright 1989 Looking Glass Software Limited.  All Rights Reserved.
  * Unless otherwise licenced, the only authorized use of this source
  * code is compilation into a binary of the newsclip library for the
  * use of licenced Newsclip customers.  Minor source code modifications
  * are allowed.
  * Use of this code for a short term evaluation of the product, as defined
  * in the associated file, 'Licence', is permitted.
  */

#include "common.h"
#include "body.h"
#include "rei.h"

#define talloc		temp_alloc	/* Some convenient abbreviations. */
#define palloc		perm_alloc

extern char *temp_alloc AC(( int ));
extern char *perm_alloc AC(( int ));
extern FILE *get_body_desc AC(( int ));

extern int preserve_case;	/* If set: do not map text to lower case */
extern int white_compress;	/* If set: strip leading/trailing whitespace */
extern int paragraph_scan;	/* If set: treat paragraphs as the basic unit */
extern char *TREBuff;		/* Temporary buffer for compiled temp REs */

/*
 * We define the following terms:
 * 
 * line -- a line as it was originally typed in the article, and as such,
 * 	terminated with a newline.
 * 
 * area -- a particular subset of the article lines.  The recognized areas
 * 	currently are: 
 * 		"signature"
 * 		"text"
 *		"included" (text lines starting with "include_prefix")
 * 		"newtext" (original text only -- not include-prefixed)
 * 		"body" (text and signature)
 * 
 * unit -- an ASCIIZ string of text, with newlines removed.  A unit is the
 * 	basic search unit for R.E.s, and currently can be either
 * 		"lines" -- corresponding to the true lines of the article area,
 *  	 or
 * 		"paragraphs" -- multiple lines joined without newlines
 *		 according to the rules
 * 		   i) New paragraphs commence at the beginning of a new area.
 * 		  ii) New paragraphs commence after an empty line.
 * 		 iii) New paragraphs commence when the indentation of the
 * 			current line exceeds the indentation level of the
 * 			previous line.
 * 
 * Basically, we look through the article a byte at a time, looking for
 * areas.  Once an area is identified, then the area is divided up into
 * units.  Hopefully, all this can be done with a minimum of movement of
 * bytes and characters.
 * 
 */ 

area_type *Article;		/* Ptr to area-formatted version of article */
area_type *RawText;		/* Ptr to raw text version of article */

/* Storage for various article statistics.  The first index selects the
 * statistic (ID_LINES or ID_BYTES); the second index selects the line or
 * area type the statistic applies to (LT_BODY, LT_NEWTEXT, ...). */
long ArticleStats[2][AS_ARR_SIZE];

/* A lot of static variables are used to maintain information about
 * the current state of reading of the article body.  These permit the
 * article body to be read in an incremental manner. */

static int	FirstLine;	/* First complete line of article parsed */
static int	LastLine;	/* Last complete line read; MAXINT once EoF seen */
static int	PfxScan;	/* Is line identification to be done? (non-zero
				 * while an include or signature RE is set) */
static int	IWhiteCompress;	/* Internal version of white_compress flag */
static rxp_type InclRxp = (rxp_type) NULL; /* Compiled RE to id incl. lines */
static rxp_type SignRxp = (rxp_type) NULL; /* Compiled RE to id sig. lines */

/* Variables used to store parsing status information over buffer breaks. */

static area_type *Ap;		/* Save: current area. */
static int	  Lind;		/* Save: indent level of last complete line. */
static int	  Ltyp;		/* Save: type of the last complete line. */
static u_list	 *Lul;		/* Save: last u_list of current area. */
static char	 *Bptr;		/* Save: pointer into current buffer. */
static unsigned int Blen;	/* Save: remaining bytes in current buffer. */
static int 	InSignature;	/* Are we in the signature of the article? */
static int	ArticleParsed;	/* Indicates if article has been parsed. */
static int	StatsDone;	/* Indicates if all article stats are done. */
static char	LineSplit;	/* Was long line split? (warning ctl only) */

static FILE *Fptr;		/* File pointer for current article. */
int ArticleEoF;			/* Article end-of-file indicator; incremented
				 * by block_read() when a short block is read. */

/* Prototypes for the file-local helpers.  IDLine() is declared here because
 * it is called before its definition appears in the file; without a prior
 * static declaration, that call would implicitly declare it as an external
 * function, which conflicts with the later static definition. */

static u_list 	   *flush_lines AC(( area_type *, u_list *, char **, int ));
static unsigned int block_read AC(( char ** ));
static char 	   *copy_line AC(( char *, char * ));
static int	    IDLine AC(( char *, char **, int *, int * ));

/* prepare_body() is called once per article.  It returns every piece of
 * body-parsing state kept in this file to its initial value, allocates a
 * fresh (empty) raw-text area, and clears the article statistics table.
 * The io_mode argument is not examined here. */

void
prepare_body( io_mode )
int io_mode;
{
	/* Line counters, buffer position and paragraph tracking. */
	Lind = 0;
	Blen = 0;
	LastLine = 0;
	FirstLine = 0;
	Ltyp = LT_NONE;

	/* Progress flags. */
	ArticleEoF = 0;
	InSignature = 0;
	ArticleParsed = 0;
	StatsDone = 0;
	LineSplit = 0;

	/* No article file has been opened yet. */
	Fptr = (FILE *) NULL;

	/* No parsed areas yet; start with an empty raw-text area. */
	Article = (area_type *) NULL;
	Ap = Article;
	RawText = (area_type *) talloc( sizeof(area_type) );
	zero( RawText, sizeof(area_type) );
	RawText->txt_typ = LT_BODY;
	Lul = (u_list *) NULL;

	TREBuff = (char *) NULL;

	zero( ArticleStats, sizeof(ArticleStats) );

	/* Paragraph scanning always forces whitespace compression on;
	 * otherwise the user's white_compress setting is used as-is. */

	if( paragraph_scan )
		IWhiteCompress = 1;
	else
		IWhiteCompress = white_compress;

	/* Lines only need identifying if an include or signature RE exists. */
	PfxScan = ( InclRxp || SignRxp );
}

/* set_include_prefix() sets the include prefix, used for determining which
 * lines of the article are included, to the given string argument.
 *
 * user_pfx is a regular expression; if it does not already begin with the
 * left-end anchor '^', one is prepended so that the prefix is only matched
 * at the point where the search starts.  An empty (or null) string removes
 * the include prefix, so that no line is identified as included. */

void
set_include_prefix( user_pfx )
char *user_pfx;
{
	char *inc_pfx;

	if( InclRxp ) {
		/* Free old compiled version of the include prefix string */
		perm_free( InclRxp );
		InclRxp = (rxp_type) NULL;
		}

	/* The emptiness test must be made on the user's string, before
	 * the anchor is added: the anchored copy always starts with '^'
	 * and is therefore never empty, and a bare "^" expression would
	 * match (and so classify as included) every line. */

	if( user_pfx && *user_pfx ) {
		if( '^' == *user_pfx ) {
			inc_pfx = user_pfx;
			}
		else {
			/* Need to make a copy of the user's string in order
			 * to insert the left-end anchor character. */
			inc_pfx = talloc( strlen( user_pfx ) + 2 );
			inc_pfx[0] = '^';
			strcpy( inc_pfx + 1, user_pfx );
			}

		InclRxp = REG_COMP_P( inc_pfx );
		}

	/* Update the prefix scanning flag to indicate whether or not
	 * scanning is to be done for included or signature lines. */

	PfxScan = InclRxp || SignRxp;
}

/* set_signature_start() installs the pattern that marks the first line of
 * the article signature.  An empty string removes the pattern. */

void
set_signature_start( sig_strt )
char *sig_strt;
{
	/* Release the previously compiled signature-start expression. */
	if( SignRxp )
		perm_free( SignRxp );

	if( *sig_strt )
		SignRxp = REG_COMP_P( sig_strt );
	else
		SignRxp = (rxp_type) NULL;

	/* Line identification is only needed while at least one of the
	 * include or signature expressions is defined. */

	PfxScan = ( InclRxp || SignRxp );
}

/* read_body() performs the first-level analysis of the article body, by
 * reading the article and counting bytes and lines.  Lines are stored
 * individually as ASCIIZ strings in preparation for any necessary
 * higher-level processing, such as article section parsing (handled by
 * parse_body() below).
 *
 * Reading continues until LastLine reaches "end" or the end of the article
 * is found (at which point LastLine is set to MAXINT).  The "start"
 * argument is not examined.  Unless preserve_case is set, the text is
 * mapped to lower case in place as it is scanned. */

void
read_body( start, end )
unsigned int start, end;
{
	register char *nbptr;	/* Updated pointer into buffer */
	register int nblen;	/* Updated size of buffer */
	char *lps;		/* Ptr to start of the current line. */
	char *lpe;		/* Ptr to the end of the current line. */
	int len;		/* Length of the current line. */
	char *lptr[ULB_SIZE];	/* Temporary storage for located line ptrs. */
	short lidx = 0;		/* Index into lptr array. */

	if( !Fptr ) {
		/* If incremental reading is ever implemented, the
		 * argument to get_body_desc() will need to be computed
		 * intelligently, probably based somehow on end. */
		Fptr = get_body_desc( MAXINT );
		}

	while( end > LastLine ) {

		if( !Blen ) {
			/* The current block has been completely scanned;
			 * if EoF hasn't been reached, attempt to read
			 * another block.  If nothing further is read,
			 * then EoF has been encountered, and the reading
			 * process is complete. */

			if( ArticleEoF || !(Blen = block_read( &Bptr )) ) {
				LastLine = MAXINT;
				break;
				}
			}

		/* Scan for the newline ending the current line.  On exit,
		 * nblen is the number of bytes left after the newline, or
		 * -1 if the buffer ran out before a newline was found. */

		lps = nbptr = Bptr;
		if( preserve_case ) {
			for( nblen = Blen; nblen-- && *nbptr++ != '\n'; )
				;
			}
		else {
			for( nblen = Blen; nblen-- && *nbptr != '\n'; nbptr++ )
				*nbptr = lowcase( *nbptr );
			nbptr++;
			}

		if( nblen < 0 ) {
			/* End of buffer reached; it is necessary to read
			 * another block from the article and continue
			 * the search for the end of this line. */
			register char *tptr;		
			int tlen, spl;
			char *addressptr;

			nblen = 0;
			if( !ArticleEoF ) {
				nblen = block_read( &addressptr );
				nbptr = addressptr;
				}

			if( !nblen ) {
				/* Hmm.  End of file has been encountered, but
				 * the end of the current line has not.  Handle
				 * this by just faking in the final newline.
				 * (This also covers a read that returned no
				 * data, so that an empty block is never
				 * scanned.) */
				warning(2, "premature end-of-file encountered");
				nblen = 1;
				nbptr = "\n";
				}

			/* Scan at most the nblen valid bytes of the new
			 * block.  tlen ends up as the number of bytes in
			 * the continuation including its newline, or as a
			 * value greater than nblen if no newline was found. */

			tlen = 0;
			if( preserve_case ) {
				for( tptr = nbptr; tlen++ < nblen &&
							 *tptr++ != '\n'; )
					;
				}
			else {
				for( tptr = nbptr; tlen++ < nblen &&
						  *tptr != '\n'; tptr++ )
					*tptr = lowcase( *tptr );
				tptr++;
				}

			if( tlen > nblen ) {
				/* Hmm.  It appears that the current line
				 * spans more than one full buffer.  Sorry,
				 * but the line will have to be split. */
				if( !LineSplit ) {
					warning( 2,
					 "line(s) too long -- line(s) split" );
					LineSplit = 1;
					}
				
				tlen = nblen;	/* Truncate line length. */
				spl = 1;	/* Byte for added "newline". */
				}
			else {
				spl = 0;	/* Extra byte not required. */
				}

			/* Join the tail of the old block and the head of
			 * the new block into one contiguous line. */

			lps = talloc( (Blen + tlen + spl)*sizeof(char) );
			memcpy( lps, Bptr, Blen );
			memcpy( lps + Blen, nbptr, tlen );
			len = Blen + tlen + spl - 1;
			lpe = lps + len;
			Bptr = tptr;
			Blen = nblen - tlen;
			}
		else {
			Bptr = nbptr;
			Blen = nblen;
			lpe = Bptr - 1;
			len = 1 + (lpe - lps);
			}

		LastLine++;		/* We've found one more line... */
		ArticleStats[ID_LINES][LT_BODY]++;
		ArticleStats[ID_BYTES][LT_BODY] += len;

		/* Insert the line into the current area. */
		
		lptr[lidx++] = lps;  /* Stash ptr to start of line. */
		*lpe = '\0';	     /* Stomp null over end newline. */
		RawText->size += len; /* Add line length to area size. */

		if( lidx >= ULB_SIZE ) {
			/* Line buffer is full; allocate "permanent"
			 * storage and copy ptrs to the new buffer. */

			Lul = flush_lines( RawText, Lul, &lptr[0], lidx );
			lidx = 0;
			}
		}

	if( lidx ) 
		Lul = flush_lines( RawText, Lul, &lptr[0], lidx );
}

/* parse_body() divides the [already-read] article body into its component
 * subsections of included text, new text, and signature text, and updates
 * statistics on the various areas.  The first level of paragraphing is also
 * performed here -- lines are associated into paragraphs.
 *
 * The whole article is always processed: the body is read to end-of-file
 * first if necessary, and the start and end arguments are not examined.
 * The result is a linked list of areas headed by Article.  Calling the
 * routine again after the article has been parsed does nothing. */

void
parse_body( start, end )
unsigned int start, end;
{
	char *lps;		/* Start of the line text, as set by IDLine() */
	int ind, len, j;	/* Line indent, line length, index into u_list */
	char *lptr[ULB_SIZE];	/* Temporary storage for located line ptrs. */
	short lidx = 0;		/* Index into lptr array. */
	u_list *ul;		/* Current block of raw text line pointers */
	int typ;		/* Type of the current line */
	area_type *nap;		/* Newly allocated area */

	if( ArticleParsed )
		return;

	if( !ArticleEoF )
		/* Ensure that the entire article body has been read. */
		read_body( 1, MAXINT );

	/* Walk every line of the raw text, in order. */
	for( ul = RawText->list; ul; ul = ul->next ) 
	    for( j = 0; j < ul->size; j++ ) {

		typ = IDLine( ul->u_txt[j], &lps, &ind, &len );

		if( InSignature ) {
			/* If "signature_start" has already been seen, then
			 * all subsequent lines are really signature lines. */
			typ = LT_SIGNATURE;
			}
		else if( typ == LT_SIGNATURE ) {
			/* The signature line has just been seen.  Set the
			 * flag to indicate that all subsequent lines are
			 * part of the article signature, and set PfxScan
			 * to zero so that no further line identification
			 * will be attempted. */
			PfxScan = 0;
			InSignature++;
			}

		/* The extra byte presumably accounts for the newline that
		 * originally terminated the line. */
		ArticleStats[ID_LINES][typ]++;
		ArticleStats[ID_BYTES][typ] += 1 + len;

		if( (typ != Ltyp) || (paragraph_scan && (ind > Lind)) ) {
			/* A new paragraph or text area has been encountered. */

			if( Ltyp != LT_NONE && lidx ) {
				/* Flush accumulated lines into old area. */
				(void) flush_lines( Ap, Lul, &lptr[0], lidx );
				lidx = 0;
				}

			/* The new area has no line list yet. */
			Lul = (u_list *) NULL;

			if( !Ap || Ap->list ) {
				/* Allocate new area/paragraph encountered, and
				 * link it onto the list of article areas.  If
				 * the current area never received any lines,
				 * it is reused instead of allocating another. */
				nap = (area_type *) talloc( sizeof(area_type) );
				if( Article )
					Ap = Ap->next = nap;
				else
					Ap = Article = nap;
				}

			zero( Ap, sizeof(area_type) );
			Ltyp = Ap->txt_typ = typ;
			}

		if( len ) {
			/* Insert the line into the current area, if it
			 * isn't an empty line. */
		
			lptr[lidx++] = lps;  /* Stash ptr to start of line. */
			Ap->size += len + 1; /* Add line length to area size. */
			}
		else {
			/* Set indentation to ensure next
			 * line will start a new paragraph. */
			ind = -1;
			}

		Lind = ind;		/* Save the level of line indent. */

		if( lidx >= ULB_SIZE ) {
			/* Line buffer is full; allocate "permanent"
			 * storage and copy ptrs to the new buffer. */

			Lul = flush_lines( Ap, Lul, &lptr[0], lidx );
			lidx = 0;
			}
	    }

	/* Flush whatever lines remain buffered for the final area. */
	if( lidx ) 
		Lul = flush_lines( Ap, Lul, &lptr[0], lidx );

	ArticleParsed = TRUE;
}

/* init_stats() brings the article statistics table up to date for the
 * requested region.  Body statistics only need the article to be read;
 * statistics for any other region need the article to be parsed, after
 * which the "text" totals are derived (once) as the sum of the new-text
 * and included-text totals. */

void
init_stats( statid )
int statid;
{
	if( statid == LT_BODY ) {
		read_body( 1, MAXINT );
		return;
		}

	if( StatsDone )
		return;		/* Derived totals are already in place. */

	if( !ArticleParsed )
		parse_body( 1, MAXINT );

	ArticleStats[ID_LINES][LT_TEXT] = ArticleStats[ID_LINES][LT_NEWTEXT]
					+ ArticleStats[ID_LINES][LT_INCLUDED];
	ArticleStats[ID_BYTES][LT_TEXT] = ArticleStats[ID_BYTES][LT_NEWTEXT]
					+ ArticleStats[ID_BYTES][LT_INCLUDED];

	StatsDone++;
}

/* IDLine() "parses" the next line starting from rptr, which points to
 * the line to be parsed, as an ASCIIZ string.  Pointers to the actual
 * start of the string (white-space and/or include_prefix trimmed), the
 * level of indentation and the line length are returned; the "type" of
 * the line -- included, signature or newtext -- is returned explicitly.
 *
 * The returned length excludes trailing whitespace when whitespace
 * compression is in effect, but the line itself is not modified.  A
 * form feed in the leading whitespace yields an indentation of MAXINT. */

static int
IDLine( rptr, start, indent, length )
char *rptr;				/* Start of unidentified text line */
char **start;	 			/* Returned ptr to line start. */
int *indent, *length;			/* Returned line indent, length */
{
	register char *ptr = rptr;	/* Line scanning pointer */
	int id = 0;			/* Level of line indentation */
	int ltype;			/* Area to which the line belongs */
	int llen = 0;			/* Line length computed. */
	char *eptr;			/* Ptr just past last char in line. */

	/* The first task is to determine the level
	 * of indentation of this particular line. */

	while( *ptr ) {
		if( *ptr == ' ' )
			id++;
		else if( *ptr == '\t' )
			/* Advance to the next multiple-of-8 tab stop. */
			id += 8 - (id % 8);
		else if( *ptr == '\f' ) {
			id = MAXINT;
			break;
			}
		else
			break;
		ptr++;
		}

	/* Indicate where the line starts (either stripped or actual). */
	*start = IWhiteCompress ? ptr : rptr;

	/* The next task is to identify the area with which the line is
	 * associated.  If it is known that both of the include_prefix and
	 * signature_start strings are empty, then the line type is
	 * automatically assigned as NEWTEXT. */

	if( PfxScan ) {
		/* Look for a match on "include_prefix" */
	
		/* Search starts after whitespace is skipped, always. */
		if( InclRxp && REG_EXEC( InclRxp, ptr ) ) {
			ltype = LT_INCLUDED;
			if( paragraph_scan ) {
				/* In paragraph mode the matched prefix is
				 * not part of the line text. */
				*start = InclRxp->endp[0];
				}
			}
		/* Search starts at true beginning of line, always. */
		else if( SignRxp && REG_EXEC( SignRxp, rptr ) )
			ltype = LT_SIGNATURE;
		else
			ltype = LT_NEWTEXT;
		}
	else {
		ltype = LT_NEWTEXT;
		}


	llen = strlen( *start );
	if( IWhiteCompress && llen ) {
		/* If IWhiteCompress is set, then ensure that ptr is updated
		 * to point at the end of the line in preparation for the
		 * trimming of any trailing whitespace from the line. */
		while( *ptr )
			ptr++;
		eptr = ptr;
		/* Scan back over any end-of-line whitespace.  The cast
		 * keeps the isspace() argument in range for 8-bit text. */
		for( ; llen && isspace( (unsigned char) *(eptr-1) );
							llen--, eptr-- )
			;
		}

	/* Line successfully scanned... update the return
	 * values and explicitly return the line type. */

	*length = llen;
	*indent = id;

	return( ltype );
}

/* flush_lines() moves a batch of buffered line pointers into a newly
 * allocated u_list and appends that u_list to the given area.  The return
 * value is the new tail of the area's list: the fresh u_list, or lul
 * unchanged when there was nothing to flush. */

static
u_list *
flush_lines( ap, lul, lptr, size )
area_type *ap;				/* Area receiving the lines. */
u_list *lul;				/* Current tail of the area's list. */
char **lptr;				/* Buffered line pointers. */
int size;				/* How many pointers are buffered. */
{
	u_list *node;

	if( size ) {
		/* One pointer slot is already part of u_list itself;
		 * room is added for the remaining size-1 pointers. */
		node = (u_list *) talloc( sizeof(u_list)
					  + (size-1)*sizeof(char *) );
		memcpy( &node->u_txt[0], lptr, size*sizeof(char *) );
		node->next = (u_list *) NULL;
		node->size = size;

		/* Hook the node on after the old tail, or make it
		 * the head of the list if the area had none. */
		if( !lul )
			ap->list = node;
		else
			lul->next = node;

		lul = node;
		}

	return( lul );
}

/* block_read() obtains a block of article text at a time, for analysis.
 *
 * A fresh BLOCK_SIZE buffer is allocated and its address stored through
 * ptr.  The number of bytes actually read is returned.  A short read
 * marks the article as being at end-of-file; if the short read was due
 * to a file error rather than end-of-file, the error is reported. */

static
unsigned int
block_read( ptr )
char **ptr;
{
	unsigned int len;

	*ptr = talloc( BLOCK_SIZE*sizeof(char) );

	len = fread( *ptr, sizeof(char), BLOCK_SIZE, Fptr );

	if( len < BLOCK_SIZE ) {
		/* fread() returns a short count both at end-of-file and
		 * on error; check for an error on every short read, not
		 * just when nothing at all was read, so that a partial
		 * read is not silently taken for the end of the article. */
		if( ferror( Fptr ) )
			error( "file error during read" );

		ArticleEoF++;
		}

	return( len );
}

/* paragraphize() builds the paragraph form of an area: all of the area's
 * lines are copied, one after another, into a single ASCIIZ string which
 * is stored in ap->para for use by paragraph-mode regular expression
 * searches. */

void
paragraphize( ap )
area_type *ap;
{
	register char *out;		/* Next write position in paragraph */
	register u_list *node;		/* Current block of line pointers */
	register int i;

	ap->para = talloc( (ap->size + 1)*sizeof(char) );
	out = ap->para;

	node = ap->list;
	while( node ) {
		for( i = 0; i < node->size; i++ )
			out = copy_line( out, node->u_txt[i] );
		node = node->next;
		}

	/* Each copied line leaves a separating blank behind it; if
	 * anything was copied, back up over the final one. */
	if( out != ap->para )
		out--;

	*out = '\0';
}

/* copy_line() copies characters starting from rptr to wptr, compressing
 * internal whitespace regardless of the setting of the IWhiteCompress
 * flag.  It is known that having paragraph_scan set implies that
 * IWhiteCompress is also set.
 *
 * Leading and trailing whitespace is dropped, and each internal run of
 * whitespace becomes a single blank.  If anything at all was copied, one
 * trailing blank is appended to separate this line from the next.  No
 * terminating null is written.  The return value points just past the
 * last character written (it equals wptr if the line held only
 * whitespace or was empty). */

static char *
copy_line( wptr, rptr )
register char *wptr;		/* Ptr to target memory for the line copy. */
register char *rptr;		/* Ptr to the line to copy. */
{
	char *origptr = wptr;

	/* The casts to unsigned char keep the isspace() argument within
	 * its defined range when the text holds 8-bit characters. */

	while( *rptr && isspace( (unsigned char) *rptr ) )
		rptr++;

	while( *rptr ) {
		/* Copy one word... */
		while( *rptr && !isspace( (unsigned char) *rptr ) )
			*wptr++ = *rptr++;
		/* ...skip the whitespace after it... */
		while( *rptr && isspace( (unsigned char) *rptr ) )
			rptr++;
		/* ...and separate it from the next word, if any. */
		if( *rptr ) {
			*wptr++ = ' ';
			}
		}

	if( origptr != wptr ) {
		/* Add a trailing space only if we moved the write pointer. */
		*wptr++ = ' ';
		}

	return( wptr );
}

#ifdef DEBUG

/* Printable names for the area types; dump_body() indexes this table with
 * an area's txt_typ value.  NOTE(review): the "** illegal **" entries
 * presumably fill index values that no LT_* constant uses -- confirm
 * against the LT_* definitions. */
char *AreaNames[] = {
	"LT_NONE",
	"LT_SIGNATURE",
	"LT_INCLUDED",
	"** illegal **",
	"LT_NEWTEXT",
	"** illegal **",
	"** illegal **",
	"LT_BODY"
	};

/* dump_body() prints the article to standard output, one area at a time:
 * the parsed areas if the article has been parsed, otherwise the raw
 * text.  Each area is shown as its individual lines or, in paragraph
 * scanning mode, as its single paragraph string (built on demand). */

void
dump_body()
{
	area_type *area;
	u_list *node;
	int para_no, k;

	area = Article ? Article : RawText;

	for( para_no = 1; area; area = area->next, para_no++ ) {
		printf("{Paragraph %d (type %s)}\n",para_no,
						AreaNames[area->txt_typ]);

		if( paragraph_scan ) {
			if( !area->para )
				paragraphize( area );
			printf( ">%s<\n", area->para );
			continue;
			}

		for( node = area->list; node; node = node->next ) {
			k = 0;
			while( k < node->size ) {
				printf( ">%s<\n", node->u_txt[k] );
				k++;
				}
			}
		}
}

#endif /*DEBUG*/
