/*
 * $Header: /home/aggarwal/C/UNIX/grep_record/RCS/grep_record.c,v 1.10 1992/05/12 13:19:43 aggarwal Exp $
 */

/*+
 ** This program is a modification of the grep.c program and its
 ** primary use is in searching records. Most of the usage remains the
 ** same as for 'grep', with the exception that now the user has to
 ** specify the start r.e. and the end r.e. on the command line.
 ** By default the start regexp and end regexp are '^' and '$', so
 ** the program behaves like a 1-line context 'grep'.
 **
 ** Exit codes 0 for match, 1 for no match and 2 for errors
 **
 ** AUTHOR:
 **	Vikas Aggarwal, vikas@jvnc.net, January 18, 1990
 */

/*
 *
 * $Log: grep_record.c,v $
 * Revision 1.10  1992/05/12  13:19:43  aggarwal
 * Added the  missing 3rd argument to the call of do_line on line 388
 *
 * Revision 1.9  1992/05/10  21:32:05  aggarwal
 * Fixed the '-q' (quiet) option.
 *
 * Revision 1.8  1992/05/10  20:34:54  aggarwal
 * Small tweaking of the 'b' and 'n' options for consistency.
 *
 * Revision 1.7  1992/05/07  20:36:55  aggarwal
 * Moved 'types.h' to top and also changed name of variable. Ultrix
 * was complaining.
 *
 * Revision 1.6  1992/04/27  19:13:29  aggarwal
 * Made sure that the exit status was valid.
 *
 * Revision 1.5  1992/04/24  22:48:00  aggarwal
 * Uses 'fgets' instead of trying to do my own.
 *
 * Revision 1.4  1992/04/23  20:18:05  aggarwal
 * This version uses the GNU 'regex.c' library.
 * It is extremely slow due to the GNU library code. As a comparison:
 * 	time ./grep_record.1.4 'Ae\|Qe\|Oe\|Xe\|Ze' /usr/local/dict/words
 * 	187.8u 3.5s 6:32.42 48.7% 0+112k 1+1io 0pf+0w
 *
 * However, it does extended pattern matching.
 *
 * Revision 1.2  1990/04/02  12:33:25  aggarwal
 * 1) Changes to the way start of record is handled if vflag
 * 2) Default start and end of record are newlines "^"
 *
 * Revision 1.1  90/03/28  11:13:18  aggarwal
 * Initial revision
 *
 *
 */


#ifndef lint
 static char rcsid[] = "$RCSfile: grep_record.c,v $ $Revision: 1.10 $ $Date: 1992/05/12 13:19:43 $" ;
#endif


#include <sys/types.h>
#include <ctype.h>
#include <stdio.h>
#include <strings.h>
#include <sys/stat.h>
#include <sys/file.h>
#include "regexp.h"

/* Fallback line-buffer size; normally <stdio.h> already supplies BUFSIZ. */
#ifndef BUFSIZ
#define BUFSIZ 1024
#endif  /* BUFSIZ */

/*
 * Block size for reading a chunk of data from a file.
 * NOTE(review): the expansion is not parenthesized, so an expression such
 * as 'x / BLKSIZ' would not group as intended. The macro is not referenced
 * in the part of the file visible here.
 */
#ifndef BLKSIZ
#define BLKSIZ  BUFSIZ * 8
#endif

/* Older systems only provide L_SET for 'lseek'; map SEEK_SET onto it. */
#ifndef SEEK_SET
#define SEEK_SET L_SET
#endif

/* Number of bits in a byte (intended for a fastmap/translate table). */
#ifndef BYTEWIDTH
#define BYTEWIDTH 8
#endif

/* First integer value that is greater than any char code. */
#ifndef _NOTCHAR
#define _NOTCHAR (1 << BYTEWIDTH)
#endif  /* _NOTCHAR */


/* Exit status codes. */
#define MATCHES_FOUND 0         /* Exit 0 if no errors and matches found. */
#define NO_MATCHES_FOUND 1      /* Exit 1 if no matches were found. */
#define ERROR 2                 /* Exit 2 if some error occurred. */

/* Write a fixed string to stderr without treating it as a format string. */
#define Fprintf(_s)	fprintf(stderr, "%s", (_s))

/*
 * Symbolic names for the three regular expressions. The values are used
 * as indices into re_compiled[], byte_count[] and main()'s regexp_str[].
 */
enum  reg_expressions { START, END, BODY } ;

/* getopt() interface variables */
extern char *optarg ;
extern int optind, opterr;
extern int errno ;
char *lowercase_string() ;	/* returns ptr to static buffer */

struct  regexp *re_compiled[3];	/* compiled regular expressions, indexed
				   by START, END and BODY */

static off_t byte_count[3] ;	/* byte offsets from start of file of the
				   current record's START, END and BODY */
static off_t cur_offset ;	/* offset in file of the line being processed */

static FILE *ofile ;		/* output stream (stdout unless -o is given) */
static char *cur_file ;		/* name of the file that is being processed */
static int eof ;	      	/* set once cur_file has been read completely */
static int lineno ;		/* present line number within cur_file */
static int found;		/* set when BODY matched in the current record */
static int first_match ;	/* nonzero until the filename header has been
				   printed for cur_file */
static int numfiles ;		/* total input files (name is printed only
				   when there is more than one) */
static int exit_status ;	/* value handed to exit() by main() */

/* *** Flags *** */
static int bflag ;		/* -b: print byte offsets with each record */
static int debug, debug2 ;	/* -d given once / more than once */
static int nflag ;		/* -n: print line numbers from file also */
static int quiet ;		/* -q: no output, only the exit status */
static int ignore_case;		/* -i: lowercase patterns and lines first */
static int vflag ;		/* -v: print the records NOT matching */

char *prognm ;			/* av[0], used as prefix in messages */

/*
 * usage_and_die
 *	Write the command line synopsis to stderr and terminate the
 * program with the ERROR exit status. Does not return to the caller.
 */
int
usage_and_die()
{
    static char *help_lines[] =
    {
	"[-b (byte offsets)]  [-d (debug)] [-q quiet]\n",
	"\t[-i (ignore case)]  [-n (line numbers)] [-o <output file>]\n",
	"\t[-s <start of record expr>] [-e <end of record exp>]\n",
	"\t[-v (print not matching)]  <regular expr> [files...]\n"
    };
    int nlines = (int) (sizeof help_lines / sizeof help_lines[0]) ;
    int k ;

    fprintf(stderr, "Usage: %s ", prognm);

    for (k = 0 ; k < nlines ; ++k)
      fputs(help_lines[k], stderr);

    exit (ERROR);
}


/*
 * main
 *	Parse the command line, compile the START, END and BODY regular
 * expressions and run do_file() over every file named on the command
 * line.
 *
 * Options: -b -d -i -n -q -v are simple flags, -o <file> redirects the
 * output, -s <re> and -e <re> give the start and end of record
 * expressions (defaulting to "^" and "$"). The first non-option
 * argument is the BODY expression, the remaining ones are input files.
 *
 * The process exits with 'exit_status', which starts out as
 * NO_MATCHES_FOUND and is changed by print_between_offsets(); usage
 * errors exit with ERROR.
 */
int
main (int ac, char **av)
{
    extern int optopt ;		/* getopt(): the option char it rejected */
    int c, i;
    char *(regexp_str[3]) ;	/* array of ptrs to strings */
    char *translate = NULL;	/* translate table for case conversion */

    prognm = av[0] ;
    exit_status = NO_MATCHES_FOUND ;

    for (i = 0 ; i <= BODY ; ++i)
      regexp_str[i] = NULL ;

    /* getopt() signals the end of the options with -1 */
    while ( (c = getopt(ac, av, "bdino:qs:e:v")) != -1)
      switch (c)
      {
       case 'b':		/* print byte numbers		*/
	  bflag++;
	  break;

       case 'd':		/* debug flag			*/
	  if (debug)
	    debug2++ ;		/* higher debug level */
	  else
	    debug++ ;
#ifndef lint
	  if (debug2)
	    fprintf(stderr, "VERSION: %s\n", rcsid) ;
#endif
	  break;

       case 'i':		/* ignore case			*/
	  ignore_case++;
	  break;

       case 'n':		/* line numbers printed		*/
	  nflag++;
	  break;

       case 'o':		      	/* Output file  */
	  if (ofile != NULL)		/* -o repeated: drop the old one */
	    fclose (ofile);
	  ofile = fopen (optarg, "w");	/* Truncate if exists	*/
	  if (ofile == NULL)		/* couldn't open file	*/
	  {
	      fprintf(stderr,"%s (main): write open ", av[0]);
	      perror(optarg);
	      exit (ERROR);
	  }
	  break;

       case 'q':		/* nothing output, just the exit status */
	  quiet++;
	  break;

       case 's':		/* 'start of record' expression	*/
	  regexp_str[START] = optarg;
	  break ;

       case 'e':		/* 'end of record' expression	*/
	  regexp_str[END] = optarg;
	  break ;

       case 'v':		/* all lines NOT matching	*/
	  vflag++;
	  break;

       case '?':
       default:
	  /*
	   * The format needs both a string and a character; the
	   * rejected option character is left in 'optopt' by getopt().
	   */
	  fprintf(stderr, "Error (%s): unknown flag %c\n", prognm, optopt);
	  usage_and_die();
	  break;
      }

    if (ofile == NULL)		/* use standard output */
      ofile = stdout ;

    if (debug)
      quiet = 0 ;		/* cannot have quiet with debug */

    if (optind >= ac)		/* no regular expression specified */
      usage_and_die() ;

    regexp_str[BODY] = av[optind++];

    if ( (numfiles = (ac - optind)) <= 0 )	/* no input files specified */
    {
	fprintf(stderr, "(%s) ERROR: Input files not specified\n", prognm);
	exit (ERROR);
    }

    /*
     * Set the default start and end patterns (beginning and end of a
     * line), which makes every line a record of its own.
     */
    if (regexp_str[START] == NULL)
      regexp_str[START] = "^" ;

    if (regexp_str[END] == NULL)
      regexp_str[END] = "$" ;

    /*
     * compile all the regular expressions...
     */
    regexp_compile_all(regexp_str, translate) ;

    /*
     * Process each of the files specified
     */
    while (optind < ac)
      do_file( (cur_file = av[optind++]) ) ;	/* refer cur_file */

    exit (exit_status) ;

}	/* end main() */



/*+ 		regexp_compile_all
 ** FUNCTION:
 **	Compile all the three regular expressions. Essentially prep
 ** for the regular expression searches later.
 **/

regexp_compile_all(regexp_str, translate)
     char **regexp_str ;     	/* array of regular expressions */
     char *translate ;		/* the translate table for 'case' */
{
    int i ;

    for (i = START ; i <= BODY ; ++i)
    {
	if (ignore_case)
	  strcpy (regexp_str[i], lowercase_string(regexp_str[i]) ) ;
	re_compiled[i] = regcomp (regexp_str[i]) ;

#ifdef DEBUG
	if (debug2)			/* This dumps  to stdout */
	{
	    fprintf(stderr, "------- COMPILED RE --------\n") ;
	    regdump( re_compiled[i] );	/* defined if DEBUG in regexp.c */
	}
#endif
    }	/* end for() */

}	/* end regexp_compile_all */


/*+ 		regexp_srch
** FUNCTION:
** 	Search for the compiled regular expression in the given string
** and hand back the result of 'regexec' (presumably 1 for found and 0
** for not found -- confirm against regexp.c). Errors are treated as
** not-found.
** This is simply an easier frontend for 'regexec'.
**
** ARGUMENTS:
**	regexpp		compiled expression, one of re_compiled[]
**	str		NUL terminated string to search in
**/

int
regexp_srch(struct regexp *regexpp, char *str)
{
    int retloc ;

    retloc = regexec ( regexpp, str ) ;	/* search for reg expression */

#ifdef DEBUG
    /*
     * Only trace on a match: the startp/endp pointers are not
     * meaningful otherwise. The precision of "%.*s" has to be an int.
     */
    if (debug2 && retloc)
      fprintf (stderr,
	       "(debug) [%d]: Matched regexp ->%.*s\n",
	       lineno, (int) (regexpp->endp[0] - regexpp->startp[0]),
	       regexpp->startp[0] ) ;
#endif

    return (retloc) ;

}	/* end regexp_srch() */


/*+ 		regerror
 ** FUNCTION:
 **	Error hook needed by the regex routines. Prints the program
 ** name, the current line number and the message passed in on stderr
 ** and then terminates the program with the ERROR exit status.
 **/
void regerror(s1)
     char *s1 ;			/* text describing the problem */
{
    FILE *errout = stderr ;

    fprintf(errout, "(%s) regular expression error[%d]: %s\n",
	    prognm,
	    lineno,
	    s1);

    exit(ERROR);
}



/*+ 		do_file
 ** FUNCTION:
 ** 	Do one complete file according to all the options specified.
 ** Method:
 **	o Read in one line at a time
 **	o search for expressions and store byte offsets
 **	o if to be printed, set flag
 **	o search for end expression
 **	o lseek and print out
 **	o repeat searching for expressions
 **
 ** ARGUMENTS:
 **	f	name of the file to process
 **
 ** RETURNS:
 **	1 when the file was processed, ERROR if it could not be opened.
 **
 ** NOTE: lines are read with fgets() into a BUFSIZ buffer, so a longer
 ** line is seen (and counted in 'lineno') as several pieces.
 */
int
do_file(char *f)
{
    FILE *ifile ;			/* Input file for 'fgets' */
    int fd2 ;				/* descriptor used for printing */
    int linelen ;
    static char line[BUFSIZ];		/* string read from file */

    /* Reset the various variables... */
    lineno = 0 ; cur_offset = 0 ;
    byte_count[START] = 0 ; byte_count[END] = 0;
    found = 0 ;
    eof = 0 ;
    first_match = 1 ;	      		/* to print filename first time */

    fflush (ofile) ;

    /*
     * Open the file two times- one for getting lines in and the other
     * to do the 'lseek' for printing out.
     */
    if ( (fd2 = open(f, O_RDONLY)) < 0 )
    {
	fprintf(stderr, "%s: Error in read openfile\n", prognm);
	perror(f);
	return (ERROR);
    }

    if ( (ifile = fopen(f, "r")) == NULL )
    {
	fprintf(stderr, "%s: Error in read openfile\n", prognm);
	perror(f);
	close (fd2);			/* do not leak the descriptor */
	return (ERROR);
    }

    /*
     * Now go on reading lines till end of file is reached
     */
    while (fgets(line, BUFSIZ, ifile) != NULL)
    {
	/* take the length first: do_line() strips the newline */
	linelen = (int) strlen(line) ;
	++lineno ;			/* set to present line number */

	do_line(fd2, line, linelen) ;	/* search & print if needed */

	cur_offset += linelen;		/* update current offset */
    }

    /*
     * Tell do_line() that the file has ended so that a record which is
     * still pending gets printed.
     */
    eof = 1 ;
    do_line(fd2, "", 0);		/* Send over an empty string */

    fclose (ifile);			/* close all open files */
    close (fd2);

    return (1) ;

}	/* end do_file()  */



/*+ 		do_line
** FUNCTION:
** 	Process one line and act according to what is found in the
** line. It needs to be called at the end of the file also (with the
** global 'eof' set and an empty string), since it treats EOF as an END
** match always and prints out the pending record if conditions pertain.
**
** ARGUMENTS:
**	fd	 file descriptor passed on to print_between_offsets()
**	in_line	 string to search in; a '\n' in it is overwritten by '\0'
**	linelen	 length of the string as read (including the '\n')
**
** RETURNS:
**	Always 0.
**/

int
do_line (int fd, char *in_line, int linelen)
{
    static char sline[BUFSIZ];	/* in case need to lowercase string */
    static char tmp_line[BUFSIZ];	/* last START line, for debug */
    char *line ;	      	/* ptr to regular/lowercase line */

    /*
     * The spencer library matches '$' to the '\0' at the end of the
     * string. Hence need to convert the end '\n' to a '\0'. Note that
     * here it is operating on the read line and not a copy of it.
     * (The "" passed in at end of file has no '\n', so it is never
     * written to.)
     */
    if ( (line = (char *)strchr(in_line, '\n')) != NULL)
      *line = '\0' ;

    if (ignore_case)
    {
	strcpy(sline, lowercase_string(in_line)) ;
	line = sline ;
    }
    else
      line = in_line ;

    /* Now search in sequence for the three r.e.'s - start, end
     ** and the body r.exp. If a match is found, then print record.
     ** Like a little finite state machine chugging away here.
     **
     ** LOGIC:
     **	1) If record to be printed (found == 1), then search only
     **	   for end of record.
     **	2) If (found != 1), ignore all EndOfRecord, search for
     **	   regexp, and if start_of_rec found, update pointer.
     **
     ** CAVEATS:
     ** 1) A simple matter for debate, but if vflag set, then if two
     ** startexp occur without a endrec in between, then ignore the
     ** second startrec (other option is to move start_ptr down).
     **
     ** 2) Cannot have the start, body, end expressions on the same line-
     ** this is because the START/END is not searched on the same line as
     ** the BODY, instead the line number is incremented.
     **
     ** 3) Note also that the BODY is searched before the START and there
     ** is no reset indicating that the start field was matched. Then the
     ** offset after the previous END record is treated as the START. What
     ** this means is that if a START is missing, then the START is set to
     ** the end of the previous END offset.
     **
     **	START BODY END BODY2 END2 ==> START  BODY END BODY2 END2
     */

    if (!found && eof)		/* don't even try to match reg exp */
      return (0) ;

    if (!vflag)		/* different machine for NOT matching	*/
    {
	switch (found)
	{
	 case 1:			/* Look only for end of record	*/
	    if ( eof || regexp_srch(re_compiled[END], line) )
	    {
		if (debug)
		  fprintf(stderr,"(debug) [%d]: Found END->%s\n",lineno, line);
		byte_count[END] = cur_offset + linelen;		/* note add */

		/* use diff file descriptor for printing out */
		print_between_offsets(fd, byte_count[START],
				      byte_count[END]) ;
		found = 0;	  	     	/* reset the value	*/
		byte_count[START] = byte_count[END] ;
	    }
	    break;

	 case 0:		/* look only for expr else start	*/
	    if (regexp_srch(re_compiled[BODY], line))
	    {			/* print the record found	*/
		++found ;
		byte_count[BODY] = cur_offset ;
		if (nflag && !quiet)	/* lineno is an int: use %d */
		  fprintf (ofile, ":: Line number:%d ::\n", lineno);
		if (debug)
		  fprintf(stderr,
			  "(debug) [%d]: Found BODY->%s\nLast START->%s\n",
			  lineno, line, tmp_line) ;
	    }
	    else if (regexp_srch(re_compiled[START], line))
	    {			/* got start of record		*/
		byte_count[START] = cur_offset ; /* store new start */
#ifdef DEBUG
		if (debug2)
		  fprintf(stderr,
			  "(debug) [%d]: Found START->%s\n",lineno,line) ;
#endif
		/*
		 * 'line' can be as long as tmp_line itself, so the
		 * prefix would overflow it; truncate instead.
		 */
		if (debug)
		  snprintf (tmp_line, sizeof tmp_line,
			    "[%d]: %s\n", lineno, line);
	    }
	    break ;

	 default:
	    break ;

	}	/* end:  switch */

    }	/* end:  if !vflag */
    else
    {
	/*
	 * The code if we are supposed to print all those that are not
	 * matching. Different from the earlier one.
	 */
	if (!found &&  regexp_srch( re_compiled[BODY], line ))
	  found++;
	else if ( regexp_srch( re_compiled[END], line ) )
	{
	    if (debug)
	      fprintf (stderr,"(debug) [%d]: Found END->%s\n", lineno, line);
	    byte_count[END] = cur_offset + linelen;	/* note add */
	    if (!found)		/* then print record */
	      print_between_offsets(fd, byte_count[START],
				    byte_count[END]) ;
	    found = 0;	  	     	/* reset the value */
	    byte_count[START] = byte_count[END] ;
	}
	else if (regexp_srch( re_compiled[START], line ))
	{
	    if (debug)
	      fprintf (stderr,"(debug) [%d]: Found START->%s\n", lineno, line);
	    if (found)		/* Assume new start	*/
	      byte_count[START] = byte_count[END] ;
	    else			/* not found, so ignore start */
	      ;
	    found = 0 ;		/* restart search if found */
	}

    }	/* end if vflag	 */

    return (0) ;

}	/* end do_line()  */


/*  */


/*+		print_between_offsets
 ** FUNCTION:
 **	Prints all between start_offset and end_offset in the open file.
 ** Also prints out the file name if necessary.
 **/

print_between_offsets(fd, start_offset, end_offset)
     int fd ;			/* file descriptor to open file		*/
     off_t start_offset, end_offset ;
{
    static char buf[BUFSIZ];
    int nchar_to_print = end_offset - start_offset ;

    exit_status = MATCHES_FOUND ;

    if (quiet)
      return (0) ;			/* nothing printed out */

    if (first_match && numfiles > 1)	/* print out filename only if..	*/
    {					/* ..lots of files and this is..*/
	first_match = 0 ;		/* ..the first record from file	*/
	fprintf (ofile, "::: %s :::\n\n", cur_file);
	fflush (ofile);
    }

    if (bflag)
      fprintf (ofile, ":: Start/Match/End offsets:%ld/%ld/%ld ::\n",
	      (long)start_offset, (long)byte_count[BODY], (long)end_offset);
    
    lseek (fd, start_offset, SEEK_SET) ;

    do					/* print out entire record	*/
    {
	register int nchar_read, c ;	

	nchar_read = read (fd, buf, BUFSIZ) ;
	c = 0 ;
	while (c < nchar_to_print && c < nchar_read)
	  putc (buf[c++], ofile) ;		/* make char +ve by &0377 */
	
	nchar_to_print = nchar_to_print - c ;	/* update remaining to print */
    } while (nchar_to_print) ;
    
    fflush (ofile);				/* very important here	*/
}   						/* end:  succeed	*/

/*+ 		lowercase_string
** FUNCTION:
** 	Convert the entire string to lowercase and return ptr to a
** static buffer with the line in lowercase. The argument itself is
** not modified.
**
** The buffer holds at most BUFSIZ - 1 characters; anything beyond that
** is dropped. It is overwritten by the next call, so the caller has to
** copy the result if it is needed for longer. A NULL argument gives an
** empty string.
**/
char *
lowercase_string(char *string)
{
    static char static_string[BUFSIZ] ;
    char *src ;
    char *dst = static_string ;
    char *limit = static_string + sizeof static_string - 1 ; /* room for \0 */

    if (string != NULL)
    {
	/*
	 * The <ctype.h> routines need an unsigned char value: a plain
	 * char above 0x7f may be negative. tolower() leaves everything
	 * that is not an upper case letter alone.
	 */
	for (src = string ; *src != '\0' && dst < limit ; src++)
	  *dst++ = (char) tolower((unsigned char) *src) ;
    }

    *dst = '\0' ;

    return (static_string) ;

}		/* end lowercase_string */
