/*
 * repair.c -- Program which reconstructs scanned source, locates errors,
 *			   and tries to fix most of them automatically.  If it
 *             can't, it drops you into an editor on the appropriate
 *             line for manual correction.
 *
 * The editor is chosen in the first available way:
 * - The -e command-line argument takes a printf() format string
 *   to format the editor invocation command line.  E.g.
 *   "emacs +%u %s".  %u and %s must appear, in that order.
 * - Failing that, the default is "$VISUAL +%u %s"
 * - Failing that, the default is "$EDITOR +%u %s"
 * - Failing that, the program prints a message and exits.
 *
 * Given a file "foo", this produces "foo.out".  If errors cause
 * you to drop into the editor, it is renamed back to "foo.in" and
 * run through again.
 *
 * Copyright (C) 1997 Pretty Good Privacy, Inc.
 *
 * Designed by Colin Plumb, Mark H. Weaver, and Philip R. Zimmermann
 * Written by Colin Plumb
 *
 * $Id: repair.c,v 1.23 1997/07/09 15:07:49 colin Exp $
 */

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "crc.h"
#include "heap.h"
#include "mempool.h"
#include "subst.h"

/*
 * The internal form of a substitution.  These are stored on
 * lists indexed by the first character of the input substitution.
 */
typedef struct Substitution {
	struct Substitution *next;	/* Next entry on the same list (or on the free list) */
	char const *input, *output;	/* Text matched in the input and the text emitted for it */
	size_t inlen, outlen;	/* strlen() of input and output, cached by MakeSubst() */
	HeapCost cost;	/* Cost charged when no filter overrides it */
	FilterFunc *filter;	/* If non-NULL, called to compute the cost in context */
	unsigned int index;	/* Consecutive serial numbers */
} Substitution;

/* The empty substitution, serial number 0; DoPage() attaches it to the root node */
struct Substitution const substNull = { NULL, "", "", 0, 0, 0, 0 };

/*
 * This might get increased later to support multiple classes of
 * substitutions, for different contexts.  Currently, only one
 * is used.
 */
#define SUBST_CLASSES 1

/* List of substitutions, indexed by first character, plus a catch-all */
Substitution *substitutions[SUBST_CLASSES][0x101];

/*
 * The pool of Substitution structures.  Remains alive for the entire
 * execution of the program.
 */
static MemPool substPool;
static Substitution *substFree;	/* Head of the free list (linked through ->next) */
static unsigned int substCount = 1;	/* Preallocate 0 to substNull */
static unsigned int substFirstDynamic;	/* Value of substCount once the built-in tables are loaded */
/* True for substitutions created after the built-in tables were loaded */
#define SubstIsDynamic(s) ((s)->index >= substFirstDynamic)

/* Every possible single-character string */
static char substChars[512];
/* The NUL-terminated one-byte string for byte value c (the empty string for 0) */
#define SubstString(c) (substChars+2*((c)&255))

/*
 * Set the list of substitutions to empty and fill in the table of
 * single-character strings used by SubstString().
 *
 * Serial number 0 is reserved for substNull, so the serial counter
 * restarts at 1; otherwise the first real substitution would share
 * a statistics slot with substNull.
 */
static void
SubstInit(void)
{
	unsigned int i, j;

	memPoolInit(&substPool);
	substFree = NULL;
	substCount = 1;	/* 0 is preallocated to substNull */
	for (i = 0; i < elemsof(substitutions); i++)
		for (j = 0; j < elemsof(*substitutions); j++)
			substitutions[i][j] = NULL;

	/* substChars holds 256 two-byte strings of the form { c, '\0' } */
	for (i = 0; i < 256; i++) {
		substChars[2*i] = (char)i;
		substChars[2*i+1] = 0;
	}
}

/*
 * For dynamically allocated substitutions, we maintain a free list.
 * Each substitution has a unique serial number.  These are retained
 * if a substitution goes on the free list, to keep substCount from
 * ratcheting upwards indefinitely while still guaranteeing uniqueness.
 */
static Substitution *
SubstAlloc(void)
{
	struct Substitution *subst = substFree;

	if (subst) {
		substFree = subst->next;
	} else {
		subst = memPoolNew(&substPool, Substitution);
		subst->index = substCount++;
	}
	return subst;
}

/* Push a substitution onto the front of the free list for later reuse */
static void
SubstFree(Substitution *subst)
{
	Substitution *oldHead = substFree;

	substFree = subst;
	subst->next = oldHead;
}

/*
 * Create a substitution mapping "input" to "output" at the given cost
 * and push it onto the front of the list for its class, selected by
 * the FIRST character of the input string.
 *
 * The strings are not copied; only the pointers are stored, so they
 * must outlive the substitution.  "filter" may be NULL.
 * Returns the new substitution.
 */
static Substitution *
MakeSubst(char const *input, char const *output, HeapCost cost,
	FilterFunc *filter, int class)
{
	struct Substitution *subst, **head;

	subst = SubstAlloc();
	subst->input = input;
	subst->output = output;
	subst->inlen = strlen(input);
	subst->outlen = strlen(output);
	subst->cost = cost;
	subst->filter = filter;

	/*
	 * Lookups (AddChildren, SubstDynamic) index by input[0], so the
	 * list must be chosen the same way regardless of the class number.
	 */
	head = &substitutions[class][input[0] & 255];
	subst->next = *head;
	*head = subst;
	return subst;
}

/*
 * Expand a table of character-for-character rules.  An entry such as
 * { "abc", "def", 5 } yields three cost-5 substitutions:
 * "a"->"d", "b"->"e" and "c"->"f".
 * A NULL output string means every listed input character is deleted.
 * The table ends at the first entry whose input string is NULL.
 */
static void
SubstSingle(struct RawSubst const *raw, int class)
{
	for (; raw->input; raw++) {
		char const *in = raw->input;
		char const *out = raw->output;

		/* Paired strings must line up character for character */
		assert(!out || strlen(in) == strlen(out));

		for (; *in; in++) {
			int from = *in;
			int to = 0;	/* 0 selects the empty string: deletion */

			if (out)
				to = *out++;
			(void)MakeSubst(SubstString(from), SubstString(to),
			                raw->cost, raw->filter, class);
		}
	}
}

/*
 * Expand a table of whole-string rules.  An entry such as
 * { "abc", "def", 5 } yields a single cost-5 substitution "abc"->"def".
 * The table ends at the first entry whose input string is NULL.
 */
static void
SubstMultiple(struct RawSubst const *raw, int class)
{
	struct RawSubst const *entry;

	for (entry = raw; entry->input; entry++)
		(void)MakeSubst(entry->input, entry->output, entry->cost,
		                entry->filter, class);
}

/*
 * Build the substitutions table: reset everything, load the built-in
 * single-character and multi-character tables into class 0, then
 * record the serial number at which dynamic substitutions will begin.
 */
static void
SubstBuild(void)
{
	SubstInit();
	SubstSingle(substSingles, 0);
	SubstMultiple(substMultiples, 0);
	/* Anything allocated from here on counts as dynamic (see SubstIsDynamic) */
	substFirstDynamic = substCount;
}

/*
 * Return a substitution that turns "pattern" into the single
 * character c.  The list for the pattern's first character is searched
 * first; a new substitution (cost COST_INFINITY, no filter) is created
 * only if no matching one exists already.
 */
static Substitution const *
SubstDynamic(char const *pattern, int c, int class)
{
	Substitution *cur = substitutions[class][pattern[0] & 255];

	while (cur != NULL) {
		int sameOutput = (cur->outlen == 1 && cur->output[0] == (char)c);

		if (sameOutput && strcmp(pattern, cur->input) == 0)
			return cur;	/* Already exists */
		cur = cur->next;
	}

	/* Not found, so make a new one */
	return MakeSubst(pattern, SubstString(c), COST_INFINITY, NULL, class);
}




/*
 * The state of the parser.
 * Note that this is updated when a ParseNode is *removed* from the heap;
 * ParseNodes that are in the heap have ParseStates that reflect the
 * state before the substitution has been parsed; this is a copy of the
 * parents' state, which is after the parsing.
 */
typedef struct ParseState {
	unsigned char pos;	/* Position on the line */
	unsigned char flags;	/* PS_FLAG_* bits */
	word16 crc16;	/* Running CRC-16 of the current line's body */
	word32 crc32;	/* Running CRC-32 of the page so far */
	word32 check;	/* Hex check value accumulated from the line prefix */
} ParseState;

/*
 * One node in the tree of candidate parses.  The heap is handed a
 * pointer to the "cost" member and its results are cast back to
 * ParseNode pointers, so "cost" must remain the first member.
 */
typedef struct ParseNode {
	HeapCost cost;	/* Total cost of the path to this node (the heap key) */
	unsigned int refcnt;	/* References held by the heap and by child nodes */
	struct ParseNode *parent;	/* Previous node on the path; also the free-list link */
	char const *input;	/* Input position after this node's substitution */
	struct Substitution const *subst;	/* The substitution this node applies */
	struct ParseState ps;	/* Parser state (see the note on ParseState) */
} ParseNode;

/* A handle for walking backwards through the output stream */
typedef struct OutputHandle {
	ParseNode const *node;	/* Node whose output is currently being read (NULL when exhausted) */
	char const *output;	/* Just past the next byte to return, within node->subst->output */
	unsigned int pos;	/* Number of bytes fetched so far, net of ungets */
} OutputHandle;

/*
 * Aim the handle at a node.  If p is non-NULL it is the starting
 * position inside that node's output string; otherwise reading starts
 * from the end of the node's output.
 */
static void
OutputInit(OutputHandle *oh, ParseNode const *node, char const *p)
{
	if (!p)
		p = node->subst->output + node->subst->outlen;

	oh->node = node;
	oh->output = p;
	oh->pos = 0;
}

/*
 * Step back one byte and return it (0..255), moving on to the parent
 * node whenever the current node's output is used up.  Returns -1 once
 * there are no more nodes.
 */
static int
OutputGetPrev(OutputHandle *oh)
{
	while (oh->node) {
		if (oh->output != oh->node->subst->output) {
			/* Something is left in this node's output */
			oh->pos++;
			oh->output--;
			return *oh->output & 255;
		}
		/* This node is exhausted; continue at the end of its parent */
		oh->node = oh->node->parent;
		if (oh->node)
			oh->output = oh->node->subst->output + oh->node->subst->outlen;
	}
	return -1;
}

/*
 * Push back the most recently fetched character and return it, or
 * return -1 if that cannot be done.  One character of push-back is
 * always available; beyond that there are no guarantees.
 */
static int
OutputUnget(OutputHandle *oh)
{
	int c;

	if (!oh->node || !*oh->output)
		return -1;

	oh->pos--;
	c = *oh->output & 255;
	oh->output++;
	return c;
}

/* The position is useful for comparing two OutputHandles. */
#define OutputPos(oh) ((oh)->pos)

/*
 * Copy output bytes backwards into the space below bufend until the
 * byte "end" is met (it is pushed back, not copied) or the output runs
 * out.  Pass -1 as "end" to fetch everything.
 * Returns the address of the first byte stored.  No lower bound on the
 * buffer is checked here.
 */
static char *
OutputGetUntil(OutputHandle *oh, char *bufend, int end)
{
	int c;

	while ((c = OutputGetPrev(oh)) != -1) {
		if (c == end) {
			/* Leave the delimiter for the next reader */
			OutputUnget(oh);
			break;
		}
		*--bufend = (char)c;
	}
	return bufend;
}

/*
 * The per-page structure.  This is actually global, but describes
 * the values kept for each page processed.
 */
typedef struct PerPage {
	char const *maxpos;	/* Furthest input position reached so far */
	HeapCost mincost;	/* Base cost against which AddChild() applies MAX_BACK_COST */
	word32 crc32;	/* Page CRC-32 read from the header line */
	unsigned int tabsize;	/* Tab size read from the header line (0 selects radix-64 parsing) */
	unsigned int lines;	/* Lines written by PrintPage() for this page */
} PerPage;

PerPage perpage;	/* The global */

/*
 * Reset the per-page values before a new page is parsed; buf is the
 * start of the page's input.  The crc32 member is left untouched.
 */
static void
PerPageInit(char const *buf)
{
	perpage.lines = 0;
	perpage.tabsize = 4;
	perpage.mincost = 0;
	perpage.maxpos = buf;
}

/*
 * Cost filter for substitutions that expand a tab.  The cost depends
 * on how the width of the substitution's output compares with the
 * distance from the parent's line position to the next tab stop.
 *
 * Returns COST_INFINITY when fewer than 3 input characters remain,
 * when the page has no tab size, or when the output is too narrow to
 * reach the tab stop.
 */
HeapCost
TabFilter(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	int column, width, excess;

	if (limit - parent->input < 3 || !perpage.tabsize)
		return COST_INFINITY;	/* No interest */

	/* Distance from the current column to the next tab stop */
	column = (int)((parent->ps.pos-PREFIX_LENGTH) % perpage.tabsize);
	width = (int)perpage.tabsize - column;

	/* How far the output overshoots that distance */
	excess = (int)subst->outlen - width;
	if (excess < 0)
		return COST_INFINITY;

	/* Landing exactly on the tab stop is the preferred outcome */
	if (excess == 0)
		return 1;

	/*
	 * When more whitespace follows, tolerate at most one extra
	 * space, and make even that expensive.
	 */
	if (parent->input[2] == TAB_CHAR || parent->input[2] == ' ') {
		if (excess > 1)
			return COST_INFINITY;
		return 15;
	}

	/* Otherwise the cost climbs slowly with the overshoot */
	return 10 + (excess - 1);
}


/* Manage a *big* pool of ParseNodes */

struct MemPool nodePool;	/* Storage from which new ParseNodes are taken */
struct ParseNode *nodeFreeList = 0;	/* Freed nodes, linked through ->parent */

/* Get ready to allocate nodes: empty free list, freshly initialized pool */
static void
NodePoolInit(void)
{
	nodeFreeList = NULL;
	memPoolInit(&nodePool);
}

/*
 * Discard every node at once by emptying the pool; the free list is
 * cleared too, since its entries lived in that pool.
 */
static void
NodePoolCleanup(void)
{
	memPoolEmpty(&nodePool);
	nodeFreeList = NULL;
}

/*
 * Allocate a new (uninitialized) node, preferring a recycled one from
 * the free list over fresh storage from the pool.
 */
static struct ParseNode *
NodeAlloc(void)
{
	struct ParseNode *recycled = nodeFreeList;

	if (!recycled)
		return memPoolNew(&nodePool, ParseNode);

	/* The free list is threaded through the parent pointer */
	nodeFreeList = recycled->parent;
	return recycled;
}

/* Put a node on the free list (reusing its parent pointer as the link) */
static void
NodeFree(struct ParseNode *node)
{
	struct ParseNode *oldHead = nodeFreeList;

	nodeFreeList = node;
	node->parent = oldHead;
}

/*
 * Drop one reference to a node.  When the count reaches zero the node
 * is freed and the reference it held on its parent is dropped in turn,
 * continuing up the chain for as long as counts keep reaching zero.
 */
static void
NodeRelease(struct ParseNode *node)
{
	assert(node->refcnt);

	do {
		struct ParseNode *up;

		if (--node->refcnt != 0)
			return;		/* Somebody else still needs it */

		/* Fetch the parent first: NodeFree() overwrites the link */
		up = node->parent;
		NodeFree(node);
		node = up;
	} while (node);
}


/* Add nodes to the substitution tree */

/*
 * Create a child of "parent" that applies "subst" at the additional
 * cost "cost", and insert it into the heap.
 *
 * Returns the child, or 0 if nothing was added: either cost is
 * COST_INFINITY, or the total would exceed perpage.mincost by more
 * than MAX_BACK_COST (the latter is reported on stdout at
 * exponentially spaced intervals).
 */
static ParseNode *
AddChild(ParseNode *parent, Heap *heap, Substitution const *subst,
	HeapCost cost)
{
	static unsigned int count = 0, limit = 0;
	ParseNode *child;

	if (cost == COST_INFINITY)
		return 0;

	cost += parent->cost;
	if (cost > perpage.mincost + MAX_BACK_COST) {
		/* Too expensive; report it, but progressively less often */
		if (++count > limit) {
			limit += limit+1;
			printf("Debug: Cost limit reached %u times.\n", count);
		}
		return 0;
	}

	/* Start from a copy of the parent and override what differs */
	child = NodeAlloc();
	*child = *parent;
	child->parent = parent;
	child->subst = subst;
	child->input += subst->inlen;
	child->refcnt = 1;	/* The heap */
	child->cost = cost;
	parent->refcnt++;
	HeapInsert(heap, &child->cost);
	return child;
}

/*
 * Generate every child of a ParseNode and add it to the heap: one
 * child for each substitution, on the list for the next input
 * character, whose input text matches at the parent's position.
 * "limit" marks the end of the input that may be looked at.
 * Exits the program if the next character has no substitutions at all.
 */
static void
AddChildren(ParseNode *parent, Heap *heap, char const *limit)
{
	char c = parent->input[0];
	Substitution *subst = substitutions[0][c & 255];
	HeapCost cost;

/* If you want to make pure insertion substitutions, do that here */

	assert(parent->input < limit);	/* We always have at least one char */

	if (!subst) {
		fprintf(stderr, "Fatal error: Illegal character %u ('%c') in input!\n",
		        c & 255, c);
		exit(1);
	}

	for (; subst != NULL; subst = subst->next) {
		/* One-character inputs match by construction; check the longer ones */
		if (subst->inlen != 1) {
			if ((size_t)(limit-parent->input) < subst->inlen)
				continue;
			if (memcmp(subst->input, parent->input, subst->inlen) != 0)
				continue;
		}

		/* A filter, when present, supplies the cost instead */
		if (subst->filter)
			cost = subst->filter(parent, limit, subst);
		else
			cost = subst->cost;
		(void)AddChild(parent, heap, subst, cost);
	}

	/* TODO: Now play with the tab stops... */
}


/*
 * Parsing operations.  This is a rather ugly and ad-hoc parser that
 * knows a lot about the fixed-field format produced by the munge
 * utility.  The main state variable is the position in
 * the line, which controls the expected header, the position of
 * tab stops, and the maximum permissible line length.
 */
#define PS_FLAG_TAB 128		/* Tabbing over a column */
#define PS_FLAG_INHEADER 64	/* Current line is a header */
#define PS_FLAG_PASTHEADER 32	/* A previous line was a header */
#define PS_FLAG_BINWS 16	/* In whitespace after binary data */
#define PS_FLAG_BINEND 8	/* End of binary data */
#define PS_FLAG_DYNAMIC 4	/* Have used ECC this line */
#define PS_FLAG_PAGENUM 3	/* Length of header page number (1-3) */

/* Have we started on a second page?  Used to force flushing of the first. */
#define InSecondHeader(ps) \
	((~(ps)->flags & (PS_FLAG_INHEADER | PS_FLAG_PASTHEADER)) == 0)
/* The page-number digit counter kept in the low two flag bits */
#define PageNumDigits(pn) ((pn)->ps.flags & PS_FLAG_PAGENUM)
/* Bump that counter; callers check PageNumDigits() < 3 first so it cannot carry */
#define PageNumDigitsIncrement(pn) ((pn)->ps.flags++)

/* TODO: make these variables */
#define TABWIDTH 4	/* For now */
#define BINARY_MODE 0 /* For now */

/* Nonzero for every byte that is a radix-64 digit (filled in by Radix64Init) */
char radix64Decode[0x100];
#define isradix64(c) radix64Decode[(c) & 255]

/*
 * Build the radix-64 lookup table: zero for bytes that are not
 * radix-64 digits, and (digit value + 128) for those that are, so
 * every valid digit maps to a nonzero entry.
 */
static void
Radix64Init(void)
{
	int digit;

	memset(radix64Decode, 0, sizeof(radix64Decode));
	for (digit = 0; digit < 64; digit++)
		radix64Decode[radix64Digits[digit] & 255] = digit + 128;
}

/* Reset a ParseState to its top-of-page values (every field zero) */
static void
ParseStateInit(ParseState *ps)
{
	ps->pos = 0;
	ps->flags = 0;
	ps->crc16 = 0;
	ps->crc32 = 0;
	ps->check = 0;
}

/*
 * Debugging utility: show progress on a single, continually
 * overwritten status line.  The line (at most 79 characters) is
 * printed followed by a carriage return, padded with blanks to erase
 * whatever is left of a longer previous line.  A NULL line finishes
 * the status line with a newline if anything is showing.
 */
static void
OverstrikeLine(char const *line, size_t len)
{
#if 1	/* One-line status update */
	static size_t lastlen = 0;
	int blanklen = 0;

	if (line == NULL) {
		if (lastlen != 0)
			putchar('\n');
		lastlen = 0;
		return;
	}
	if (len == 0 && lastlen == 0)
		return;		/* Nothing to show, nothing to erase */

	if (len > 79)
		len = 79;
	if (lastlen > len)
		blanklen = (int)(lastlen - len);
	printf("%.*s%*s\r", (int)len, line, blanklen, "");
	fflush(stdout);
	lastlen = len;
#else	/* Print everything, for debugging */
	if (line)
		printf("%.*s\n", (int)len, line);
#endif
}

/*
 * Walk backwards through the current line.  Wherever the CRC-16 could
 * be made to come out right by changing one character into another
 * legal character, a dynamic substitution for that change is created
 * and queued (on probation) as an alternative child of the relevant
 * node's parent, flagged PS_FLAG_DYNAMIC.
 *
 * Only characters emitted by a one-character identity substitution
 * are considered.
 *
 * This function is too loose - it needs to be tightened up.  It is
 * prone to making false corrections on header lines.  Also, it
 * can't correct checksum digits.  That all can be fixed, it just
 * hasn't been done yet.
 */
static void
ErrorCorrect(Heap *heap, OutputHandle oh, word16 syndrome)
{
	Substitution const *old, *learned;
	ParseNode *child;
	int c, fixed;

	for (;;) {
		c = OutputGetPrev(&oh);
		if (c == '\n' || c == -1)
			break;

		/* Roll the syndrome back over one more character */
		syndrome = ReverseCRC16(syndrome, 0);
		if (syndrome >= 0x100)
			continue;	/* Not fixable by altering this one byte */

		fixed = c ^ syndrome;	/* Char it's changed to */
		if (!substitutions[0][fixed])
			continue;	/* That would not be a legal character */

		old = oh.node->subst;
		if (old->inlen != 1 || strcmp(old->input, old->output) != 0)
			continue;	/* Only plain identity substitutions qualify */

		/* Hey, we have a contender! */
		learned = SubstDynamic(old->input, fixed, 0);
		child = AddChild(oh.node->parent, heap, learned, DYNAMIC_COST_GUESSED);
		if (child)
			child->ps.flags |= PS_FLAG_DYNAMIC;
		/* Let the normal engine take it from here */
	}
}

/*
 * Convert "len" lowercase hex digits starting at buf to binary.
 * No error checking is performed.
 */
static word32
GetWord32(char const *buf, int len)
{
	word32 w = 0;
	int i;

	for (i = 0; i < len; i++) {
		char ch = buf[i];
		int digit = (ch > '9') ? ch - ('a'-10) : ch - '0';

		w = (w << 4) + digit;
	}
	return w;
}

/*
 * Handle the end of a line; ParseAdvance() calls this when the
 * character at "string" is a newline.
 *
 * Trailing whitespace is removed from the line's CRC-16, which is then
 * compared with the check value parsed from the line prefix.  On a
 * mismatch, error correction is attempted (once per line) and the
 * line is rejected.  Otherwise the line is collected into a buffer:
 * for the first header line the page CRC-32 and tab size are read and
 * all other pending parses are discarded; for an ordinary line the
 * running CRC-32 is advanced and its top byte checked against the
 * prefix.  Finally the state is reset for the start of the next line.
 *
 * Returns 0 if the line is accepted, COST_INFINITY if it is not.
 */
static int
ParseNewline(Heap *heap, ParseNode *pn, char const *string)
{
	OutputHandle oh;
	int c;
	char debugbuf[PREFIX_LENGTH+LINE_LENGTH+10];
	char *debugp1, *debugp2, *debugp3;
	int pos;
	word16 syndrome;
	word32 crc32;
	ParseNode *temp;

	/* Back up over trailing whitespace */
	OutputInit(&oh, pn, string);
	pos = pn->ps.pos;	/* But not the initial whitespace */
	while (pos > PREFIX_LENGTH) {
		c = OutputGetPrev(&oh);
		if (!isspace(c)) {
			OutputUnget(&oh);
			break;
		}
		/* Take the whitespace character back out of the line CRC */
		pn->ps.crc16 = ReverseCRC16(pn->ps.crc16, (byte)c);
		pos--;
	}

	/* Checksum the newline and see if it works... */
	/* (This first value is never used; it is recomputed two lines down.) */
	syndrome = pn->ps.crc16 ^ (word16)pn->ps.check;
	pn->ps.crc16 = AdvanceCRC16(pn->ps.crc16, '\n');
	syndrome = pn->ps.crc16 ^ (word16)pn->ps.check;
	if (syndrome) {
		/* If we haven't already tried ECC on the line... */
		if (!(pn->ps.flags & PS_FLAG_DYNAMIC))
			ErrorCorrect(heap, oh, ReverseCRC16(syndrome, 0));
		return COST_INFINITY;
	}

	/* Get the line into a buffer for analysis */
	debugp2 = debugbuf + sizeof(debugbuf)-1;
	debugp1 = OutputGetUntil(&oh, debugp2, '\n');
	/* Strip leading and trailing whitespace */
	while (debugp1 < debugp2 && isspace((unsigned char)debugp1[0]))
		debugp1++;
	while (debugp1 < debugp2 && isspace((unsigned char)debugp2[-1]))
		debugp2--;
	*debugp2++ = '\n';

	if (pn->ps.flags & PS_FLAG_INHEADER) {
		/* Do things for first header */
		if (!(pn->ps.flags & PS_FLAG_PASTHEADER)) {
			/*
			 * Check version number.
			 * NOTE(review): the CRC is read from offset PREFIX_LENGTH+3
			 * "after version & flags", yet this test looks at offset
			 * PREFIX_LENGTH+1 rather than PREFIX_LENGTH -- confirm
			 * against the header layout written by munge.
			 */
			if (debugp1[PREFIX_LENGTH+1] != '0') {
				fputs("Fatal: you need a newer version of repair"
				      " to process this file\n", stderr);
				exit(1);
			}
			/* Suck in CRC, after version & flags */
			perpage.crc32 = GetWord32(debugp1+PREFIX_LENGTH+3, 8);
			perpage.tabsize = GetWord32(debugp1+PREFIX_LENGTH+3+8, 1);
			/* Get tab size */
			/* Once we have the header, don't reconsider */
			/* (This test repeats the enclosing one and is always true.) */
			if (!(pn->ps.flags & PS_FLAG_PASTHEADER))
				while ((temp = (ParseNode *)HeapGetMin(heap)) != NULL)
					NodeRelease(temp);
			pn->ps.crc32 = 0;	/* Clear for top of page */
		}
	} else {
		/* Start of checksummed area */
		debugp3 = debugp1 + PREFIX_LENGTH;
		/* If the line has no body, step back one so the newline is still covered */
		debugp3 -= (debugp3 >= debugp2);
		crc32 = CalculateCRC32(pn->ps.crc32, debugp3, debugp2-debugp3);
		/* The bits of "check" above the low 16 must equal the CRC-32's top byte */
		if (crc32 >> 24 != pn->ps.check >> 16)
			return COST_INFINITY;
		pn->ps.crc32 = crc32;
	}

	/* Debugging */
	OverstrikeLine(debugp1, debugp2-debugp1-1);

	/* Start next line */
	pn->ps.pos = 0;
	/* Clear most other flags, but we *have* got a header */
	pn->ps.flags &= PS_FLAG_BINEND;
	pn->ps.flags |= PS_FLAG_PASTHEADER;
	return 0;
}

/*
 * Advance the parse state with pointed-to character.  Returns
 * COST_INFINITY if an impossible state is reached, otherwise returns a
 * cost value.  (Normally 0, this can be increased to penalize unlikely
 * output combinations to nudge the correction in a certain direction.)
 *
 * The line position selects what is expected: positions 0 through
 * PREFIX_LENGTH-1 hold the line prefix (either "--" plus hex digits
 * for a header, or hex check digits, followed by a space), and
 * everything after that is the line body, parsed as a header line, a
 * radix-64 line (when the page's tab size is 0) or a normal text line.
 * A newline in the body hands off to ParseNewline().
 *
 * All range tests on the character are done on (c & 255) so that they
 * behave the same whether plain char is signed or unsigned.
 */
static HeapCost
ParseAdvance(Heap *heap, ParseNode *pn, char const *string)
{
	int retval = 0;
	char c = *string;

	switch (pn->ps.pos) {
		case 0:
			if (c == ' ' || c == '\n') {
				break;		/* Ignore ws and blank lines completely */
			} else if (c == '\f' || c == '-') {
				/* Start of a new page */
				pn->ps.flags |= PS_FLAG_INHEADER;	/* Expect header next */
				if (c == '\f')
					break;
				else
					pn->ps.check = 0;
			} else if (pn->ps.flags & PS_FLAG_INHEADER) {
				return COST_INFINITY;	/* Illegal */
			} else if (pn->ps.flags & PS_FLAG_BINEND) {
				return COST_INFINITY;	/* Illegal */
			} else if (!(pn->ps.flags & PS_FLAG_PASTHEADER)) {
				return COST_INFINITY;	/* Illegal */
			} else if (c >= '0' && c <= '9') {
				pn->ps.check = c - '0';
			} else if (c >= 'a' && c <= 'f') {
				pn->ps.check = c - ('a' - 10);
			} else {
				return COST_INFINITY;	/* Illegal */
			}
			pn->ps.pos++;
			break;
		case 1:
			if ((pn->ps.flags & PS_FLAG_INHEADER)) {
				if (c != '-')	/* Second char of header */
					return COST_INFINITY;
				pn->ps.pos++;
				break;
			}
			/*FALLTHROUGH*/
		case 2:
		case 3:
		case 4:
#if PREFIX_LENGTH != 7
#error fix this code
#endif
		case PREFIX_LENGTH-2:
			/* Another hex digit of the check value */
			if (c >= '0' && c <= '9') {
				pn->ps.check = (pn->ps.check << 4) + c - '0';
			} else if (c >= 'a' && c <= 'f') {
				pn->ps.check = (pn->ps.check << 4) + c - ('a' - 10);
			} else {
				return COST_INFINITY;
			}
			pn->ps.pos++;
			break;
		case PREFIX_LENGTH-1:
			/* The line CRC covers only what follows the prefix */
			pn->ps.crc16 = 0;
			if (c == ' ') {
				pn->ps.pos++;
				break;
			} else if (c != '\n') {
				return COST_INFINITY;
			}
			/* Blank lines may be missing this space char */
			/*FALLTHROUGH*/
		/* The normal line starts here, at position 7 */
		default:
			if (pn->ps.flags & PS_FLAG_INHEADER) {	/* Header line */
				/* Format is "( abcd 0123456789abcdef012 Page %u of %s )" */
				int off = pn->ps.pos - (PREFIX_LENGTH+HDR_HEX_LENGTH);
				/* Offset relative to end of hex header */
				if (off < 0) {
					if ((c < '0' || c > '9') && (c < 'a' || c > 'f'))
						return COST_INFINITY;
				} else if (off < 6) {
					if (c != " Page "[off])	/* Yes, this is legal C */
						return COST_INFINITY;
				} else if (off == 6) {
					if (c < '1' || c > '9')	/* First digit of page no. */
						return COST_INFINITY;
				} else {
					/* Re-base to end of scanned part of page number */
					off -= 7 + PageNumDigits(pn);
					if (off == 0) {
						if (c >= '0' && c <= '9' && PageNumDigits(pn) < 3)
							PageNumDigitsIncrement(pn);
						else if (c != ' ')
							return COST_INFINITY;
					} else if (off < 4) {
						if (c != " of "[off])
							return COST_INFINITY;
					} else if (off == 4) {
						/* <ctype.h> needs an unsigned char value */
						if (!isgraph((unsigned char)c))
							return COST_INFINITY;
					} else if (c < ' ' || (c & 255) > '~') {
						if (c != '\n')
							return COST_INFINITY;
						return ParseNewline(heap, pn, string);
					}
				}
			} else if (!perpage.tabsize) {	/* Radix-64 line */
				/* Line is "RlNFVF9UQU==   \n" */
				if (isspace(c & 255)) {
					if (!(pn->ps.flags & PS_FLAG_BINWS)) {
						if ((pn->ps.pos - PREFIX_LENGTH) % 4 != 0)
							return COST_INFINITY;
						pn->ps.flags |= PS_FLAG_BINWS;
						if (pn->ps.pos - PREFIX_LENGTH < BYTES_PER_LINE*4/3)
							pn->ps.flags |= PS_FLAG_BINEND;
					}
					if (c == '\n')
						return ParseNewline(heap, pn, string);
				} else if (pn->ps.flags & PS_FLAG_BINWS) {
					return COST_INFINITY;
				} else if (c == '=') {
					if ((pn->ps.pos - PREFIX_LENGTH) % 4 < 2)
						return COST_INFINITY;
					pn->ps.flags |= PS_FLAG_BINEND;
				} else if (pn->ps.flags & PS_FLAG_BINEND) {
					return COST_INFINITY;
				} else if (!isradix64(c)) {
					return COST_INFINITY;
				}
			} else {	/* Normal line */
				/* Make sure tab stops come out right */
				if (pn->ps.flags & PS_FLAG_TAB) {
					if (((pn->ps.pos - PREFIX_LENGTH) % perpage.tabsize) == 0)
						pn->ps.flags &= ~PS_FLAG_TAB;
					else if (c != TAB_PAD_CHAR && c != '\n') {
						return COST_INFINITY;	/* Illegal */
					}
				}
				/*
				 * Yes, this code has hard-coded ASCII assumptions
				 * It knows that the acceptable range of '\n', ' '..'~',
				 * TAB_CHAR, FORMFEED_CHAR is in that order.
				 * Signed char machines have it backwards, to be confusing,
				 * which is why both range tests mask the character first.
				 */
				if ((c & 255) < ' ') {
					/* Newline! (Or something illegal) */
					if (c != '\n')
						return COST_INFINITY;
					return ParseNewline(heap, pn, string);
				}
				/* A normal character */
				if ((c & 255) > '~') {
					if (pn->ps.flags & PS_FLAG_INHEADER)
						return COST_INFINITY;	/* Illegal */
					if (c == TAB_CHAR) {
						pn->ps.flags |= PS_FLAG_TAB;
					} else if (c != FORMFEED_CHAR) {
						return COST_INFINITY;	/* Illegal */
					}
				}
			}
			if (++pn->ps.pos > PREFIX_LENGTH + LINE_LENGTH)
				return COST_INFINITY;
			pn->ps.crc16 = AdvanceCRC16(pn->ps.crc16, c);
			break;
	}
	return retval;
}

/*
 * Feed every character of a ParseNode's output string through
 * ParseAdvance().  Returns the sum of the penalty costs, or
 * COST_INFINITY as soon as any character is impossible.
 */
static HeapCost
ParseAdvanceString(Heap *heap, ParseNode *pn)
{
	HeapCost step, total = 0;
	char const *p;

	for (p = pn->subst->output; *p; p++) {
		step = ParseAdvance(heap, pn, p);
		if (step == COST_INFINITY)
			return COST_INFINITY;
		total += step;
	}
	return total;
}

/* Per-file usage counts, indexed by substitution serial number */
unsigned int *globalStats = NULL;
/* Number of entries of globalStats currently in use */
unsigned globalSize = 0;

/*
 * This walks the list of substitutions, performing two tasks with
 * the statistics gathered.
 *
 * First, although not essential, it prints any interesting changes
 * (non-identity substitutions) made, and a count of the total number
 * of substitutions (including identity) as an approximate character count.
 *
 * Second, it does maintenance on dynamic (learned during program
 * execution) substitutions.  It discards any substitutions that end
 * up unused, and computes nice costs for the others, based on the
 * global (per-file) statistics.
 *
 * (This function is also called at the end to print the per-file stats,
 * which does redundant weight adjustment, but it's harmless.)
 */
static void
UseStats(unsigned int *stats)
{
	unsigned int i, j, n, total = 0;
	unsigned long grand = 0;
	Substitution *s, **sp;

	/* Yes, this loop is permuted on purpose */
	if (stats) {
		for (j = 0; j < elemsof(*substitutions); j++) {
			for (i = 0; i < elemsof(substitutions); i++) {
				sp = &substitutions[i][j];
				while ((s = *sp) != 0) {
					grand += n = stats[s->index];
					if (SubstIsDynamic(s)) {
						if (n) {
							s->cost = DYNAMIC_COST_LEARNED;
						} else if (!globalStats[s->index]) {
							/* Forget unused dynamic substitutions */
							*sp = s->next;
							SubstFree(s);
							continue;
						}
					}
					if (n && strcmp(s->input, s->output) != 0) {
						total += n;
						printf("\t\"%s\" -> \"%s\" (x %u)%s\n",
							   s->input, s->output, stats[s->index],
							   SubstIsDynamic(s) ? " ** LEARNED **" : "");
					}
					sp = &s->next;
				}
			}
		}
	}
	printf("\tTotal: %u changes (out of %lu)\n", total, grand);
}

/*
 * Tally which substitutions were used on the page whose final node is
 * "node" (the chain of parents is followed back to the root), add the
 * tallies to the per-file totals, and print the page's statistics.
 * The per-file array is grown first if new substitutions have appeared.
 * Exits the program if memory runs out.
 */
static void
DoStats(ParseNode const *node, unsigned int page)
{
	unsigned int *pageStats, *grown;
	unsigned int k;

	/* Make sure the global array has a slot for every serial number */
	if (globalSize < substCount) {
		grown = realloc(globalStats, substCount * sizeof(*grown));
		if (!grown)  {
			fputs("Fatal error: out of memory for stats!\n", stderr);
			exit(1);
		}
		/* Zero only the newly added tail */
		memset(grown + globalSize, 0,
		       (substCount - globalSize) * sizeof(*grown));
		globalStats = grown;
		globalSize = substCount;
	}

	/* Per-page counters; calloc() is relied on to zero them */
	pageStats = calloc(substCount, sizeof(*pageStats));
	if (!pageStats) {
		fputs("Fatal error: out of memory for stats!\n", stderr);
		exit(1);
	}
	for (; node; node = node->parent)
		pageStats[node->subst->index]++;

	/* Fold this page into the per-file totals */
	for (k = 0; k < substCount; k++)
		globalStats[k] += pageStats[k];

	printf("Page %u substitutions:\n", page);
	UseStats(pageStats);

	free(pageStats);
}

/*
 * Spit out a page of data (needs work).  Returns number of lines.
 *
 * The page's statistics are printed first.  Then all output reachable
 * backwards from the handle is collected into a local buffer and
 * written to "out" one line at a time, with leading and trailing
 * whitespace removed from each line and a newline appended.
 *
 * The collected output is assumed to fit in PAGE_BUFFER_SIZE bytes;
 * that is not checked here.
 */
static unsigned
PrintPage(OutputHandle *oh, FILE *out, unsigned int page)
{
	char pagebuf[PAGE_BUFFER_SIZE];
	char *p1;	/* Beginning of current line */
	char *p2;	/* End of current line (WS stripped) */
	char *p3;	/* End of current line (newline) */
	char *p4;	/* End of all output */
	unsigned lines = 0;

	DoStats(oh->node, page);

	p4 = pagebuf + sizeof(pagebuf);
	p1 = OutputGetUntil(oh, p4, -1);

	/* Output the lines without leading & trailing whitespace */
	while (p1 < p4) {
		/* Identify the line */
		p3 = memchr(p1, '\n', (size_t)(p4-p1));
		if (!p3)
			p3 = p4;
		/* Delete leading whitespace (bounds first, then look at the byte) */
		while (p1 < p3 && isspace((unsigned char)*p1))
			p1++;
		/* Delete trailing whitespace */
		p2 = p3;
		while (p1 < p2 && isspace((unsigned char)p2[-1]))
			p2--;
		/* Spit out this line */
		fwrite(p1, 1, (size_t)(p2-p1), out);
		putc('\n', out);
		lines++;
		/* An unterminated last line ends the page; don't step past the buffer */
		if (p3 == p4)
			break;
		/* Advance p1 past the newline */
		p1 = p3 + 1;
	}
	return lines;
}

/*
 * Given a buffer, process a page from it.  Return the number of bytes
 * accessed, or 0 if no acceptable parse of the page was found.
 *
 * This is a best-first search: the cheapest pending ParseNode is taken
 * from the heap, its substitution's output is run through the parser,
 * and if that is possible its children are added.  The page is done
 * when a node reaches the end of the buffer or the header of the next
 * page with a CRC-32 matching the one from this page's header; the
 * page is then written to "out" and perpage.lines is set.
 *
 * Two windows bound the search: nodes more than MAX_BACK_CHARS behind
 * the furthest input position reached are dropped here, and AddChild()
 * refuses children costing more than perpage.mincost + MAX_BACK_COST.
 * perpage.mincost is raised to the highest cost expanded so far, so
 * that second window moves along with the search.
 */
static size_t
DoPage(char const *buf, size_t len, FILE *out, unsigned int page)
{
	ParseNode *node;
	Heap heap;
	HeapCost cost;
	OutputHandle oh;

	HeapInit(&heap, 1000);

	NodePoolInit();
	PerPageInit(buf);

	/* Seed the search with a root node that has consumed no input */
	node = NodeAlloc();
	node->cost = 0;
	node->refcnt = 1;
	node->input = buf;
	node->subst = &substNull;
	ParseStateInit(&node->ps);
	node->parent = NULL;

	HeapInsert(&heap, &node->cost);

	while ((node = (ParseNode *)HeapGetMin(&heap)) != NULL) {
		if (node->input + MAX_BACK_CHARS >= perpage.maxpos) {
			cost = ParseAdvanceString(&heap, node);
			if (cost != COST_INFINITY) {
				/* End of the file, or hit a second header line? */
				if (node->input == buf+len || InSecondHeader(&node->ps)) {
					/* Try to wrap up page, if crc32 works */
					if (node->ps.crc32 == perpage.crc32) {
						/* Success! */
						HeapDestroy(&heap);
						OutputInit(&oh, node, NULL);
						OverstrikeLine("", 0);

						if (InSecondHeader(&node->ps)) {
							/* Back up to last newline */
							OutputInit(&oh, node, NULL);
							while (OutputGetPrev(&oh) != '\n')
								;
							OutputUnget(&oh);
						}
						/* oh points to node that emitted last char on page */
						len = oh.node->input - buf; /* Chars eaten this page */
						perpage.lines = PrintPage(&oh, out, page);
						return len;
					}
				} else {
					/* Keep working on the page */
					node->cost = cost += node->cost;
					/* Ratchet the cost base upward, like maxpos below */
					if (cost > perpage.mincost)
						perpage.mincost = cost;
					if (node->input > perpage.maxpos)
						perpage.maxpos = node->input;
					AddChildren(node, &heap, buf+len);
				}
			}
		} else {
			/* Too far behind the front; report it progressively less often */
			static unsigned int count = 0, limit = 0;
			if (++count > limit) {
				limit += limit+1;
				printf("Debug: Distance limit reached %u times.\n", count);
			}
		}
		NodeRelease(node);
	}
	OverstrikeLine(NULL, 0);
	puts("Page failed");
	/* Failed! */

	HeapDestroy(&heap);

	return 0;
}

/*
 * Repair the file "name", writing the result to "name.out".
 *
 * Pages are repaired one at a time.  If a page cannot be repaired, the
 * rest of the input is copied through unchanged, an editor is started
 * on the output at the offending line, the edited output is renamed to
 * "name.in", and the whole process is repeated with that as input.
 *
 * "editor" is a printf() format taking a line number (%u) and a file
 * name (%s), in that order, or NULL to use $VISUAL or $EDITOR.
 *
 * Returns 0 on success (or if no editor is available), otherwise an
 * errno value.
 *
 * Note that the one buffer serves as the output file name, the page
 * input buffer and (in its upper half) the editor command and the
 * "name.in" file name, at different times.
 */
static int
RepairFile(char const *name, char const *editor)
{
	char buf[PAGE_BUFFER_SIZE];
	char *buf2;
	FILE *in, *out;
	char const *inname = name;
	char const *edit;	/* $VISUAL or $EDITOR */
	size_t done;		/* Bytes processed by DoPage */
	size_t inbytes;		/* Bytes in input buffer */
	unsigned int pages;	/* # of pages processed */
	unsigned int lines;	/* # of lines processed (until trouble) */
	int giveup;			/* Have we had to abort corrections? */
	int err = 0;		/* Copy of errno for returns */

retry:
	pages = 0;
	lines = 0;
	giveup = 0;

	strcpy(buf, name);
	strcat(buf, ".out");

	in = fopen(inname, "rt");
	if (!in) {
		err = errno;
		/* Report the file actually opened, which is "name.in" on a retry */
		fprintf(stderr, "Unable to open input file \"%s\"\n", inname);
		return err;
	}
	out = fopen(buf, "wt");
	if (!out) {
		err = errno;
		fclose(in);
		fprintf(stderr, "Unable to open output file \"%s\"\n", buf);
		return err;
	}
	printf("Repairing %s -> %s\n", inname, buf);

	/* From here on buf holds file data; the names in it are overwritten */
	globalSize = 0;	/* Reset global stats */
	inbytes = 0;	/* Bytes already at the front of the buffer */
	for (;;) {
		/* Append more data from the file */
		inbytes += fread(buf+inbytes, 1, sizeof(buf)-inbytes, in);
		if (!inbytes)
			break;
		if (giveup) {
			/* Giving up mode - just copy through */
			done = fwrite(buf, 1, inbytes, out);
			if (!done) {
				err = errno;
				fputs("Error writing output file!\n", stderr);
				fclose(in);
				fclose(out);
				return err;
			}
		} else {
			done = DoPage(buf, inbytes, out, pages+1);
			NodePoolCleanup();
			if (done) {
				pages++;
				lines += perpage.lines;
			} else {
				char const *p;
				/* Count lines until furthest position reached */
				for (p = buf;  p < perpage.maxpos; p++) {
					if (*p == '\n')
						lines++;
				}
				giveup = 1;
			}
		}
		/* Fewer bytes now in the buffer */
		inbytes -= done;
		/* Move those bytes to the front again */
		memmove(buf, buf+done, inbytes);
	}
	fclose(in);
	fclose(out);
	printf("Overall substitutions (%u pages):\n", pages);
	UseStats(globalStats);

	if (!giveup) {
		printf("\n%u lines successfully processed.\n", lines);
		return 0;
	}

	printf("\n### MANUAL CORRECTION NEEDED ON LINE %u ###\n", lines+1);

	/* Fire up editor and re-try */
	strcpy(buf, name);
	strcat(buf, ".out");
	buf2 = buf + sizeof(buf)/2;

	if (editor) {
		sprintf(buf2, editor, lines+1, buf);
	} else if ((edit=getenv("VISUAL")) != 0 || (edit=getenv("EDITOR")) != 0) {
		sprintf(buf2, "%s +%u %s\n", edit, lines+1, buf);
	} else {
		/* No way to edit; say so rather than stopping silently */
		fputs("No editor available: use -e, or set $VISUAL or $EDITOR\n",
		      stderr);
		return 0;
	}

	if (system(buf2) != 0) {
		err = errno;
		fputs("Edit failed - aborting\n", stderr);
		return err;
	}
	/* The edited output becomes the input of the next attempt */
	strcpy(buf2, name);
	strcat(buf2, ".in");
	if (rename(buf, buf2) != 0)
		return errno;
	inname = buf2;
	goto retry;
}

/*
 * So - we have a variety of substitution-generating functions.
 * They have access to the current input, the current position in
 * the input, the previous output, and, most importantly, some
 * state.  How much state is required?  A pointer and an unsigned int?
 * or more?
 *
 * Simple substitutions -> one to whatever.
 * Complex (depend on context) ->
 *
 * Simple: indexed by input character, listed in increasing
 * order of cost.
 */

/*
 * Usage: repair [-e editor-format] [--] file...
 *
 * Initializes the CRC, substitution and radix-64 tables, processes the
 * leading options, then repairs each named file in turn.  Exits with
 * status 1 on an unrecognized option, on a -e option that has no
 * argument, or when a file cannot be repaired.
 */
int
main(int argc, char *argv[])
{
	int		result = 0;
	int		i;
	char const *editor = NULL;

	InitCRC();
	SubstBuild();
	Radix64Init();

	/* Process leading flags */
	for (i = 1; i < argc && argv[i][0] == '-'; i++) {
		if (0 == strcmp(argv[i], "--")) {
			i++;
			break;
		}
		if (argv[i][1] == 'e') {
			/* The format is either attached ("-efmt") or the next argument */
			if (argv[i][2]) {
				editor = argv[i]+2;
			} else if (i+1 < argc) {
				editor = argv[++i];
			} else {
				fputs("ERROR: Option -e requires an argument\n", stderr);
				exit(1);
			}
		} else {
			fprintf(stderr, "ERROR: Unrecognized option -%c\n", argv[i][1]);
			exit(1);
		}
	}

	/* Process files */
	for (; i < argc; i++) {
		result = RepairFile(argv[i], editor);
		if (result != 0) {
			fprintf(stderr, "Fatal error: %s\n", strerror(result));
			exit(1);
		}
	}

	return 0;
}

/*
 * Local Variables:
 * tab-width: 4
 * End:
 * vi: ts=4 sw=4
 * vim: si
 */
