/*
 * subst.c -- Repair substitution tables
 *
 * Copyright (C) 1997 Pretty Good Privacy, Inc.
 *
 * Written by Colin Plumb
 *
 * $Id: subst.c,v 1.2 1997/07/09 15:07:50 colin Exp $
 *
 * IT IS EXPECTED that users of this program will play with these tables
 * and the cost values in the subst.h header.  (Some day, they'll all
 * get moved to an external config file.)
 *
 * NOTE: Other costs are hiding in the TabFilter function.
 * Remember to keep them all on the same scale.
 */

/*
 * The repair program copies its input to its output, making various
 * substitutions, until it manages to produce a version that satisfies
 * the parser.  This includes having a correct CRC for each line.
 * Each substitution has a cost, and the combinations are tried in order
 * of increasing cost.  NOTE that even translating "A"->"A" counts as
 * a substitution, although it may have zero cost.
 *
 * The intention is to correct transcription errors, where the
 * errors have a distinctly non-uniform distribution.  Slight
 * differences in cost produce a preference in trying some errors
 * first.  If an error costs half as much as another, combinations
 * of two of that error will be compared to one of the more expensive.
 * Too many cheap substitutions will result in repair spending
 * a very long time searching before considering the more expensive
 * substitutions.
 *
 * The following parameters and the raw substitution tables are expected
 * to be edited by the user based on experience.  Eventually, this
 * will be moved into an external config file, but for now it's a matter
 * of recompiling.
 */

#include "subst.h"
#include "util.h"

/*
 * The input substitutions to make (one-to-one).   These are listed in
 * the order of correction. i.e. uncorrected input first, then corrected
 * output.  Substitutions are one-way; to get two-way, list it twice.
 */

/*
 * Entry builders for substSingles.  Every entry carries four fields, in
 * this order: uncorrected input characters, corrected output characters,
 * cost, and a fourth field that is NULL throughout this table.
 *
 * SINGLES_KEEP maps a set of characters onto itself.
 * SINGLES_SWAP maps "from" onto "to" (presumably position by position,
 * since the substitutions are one-to-one and the paired strings always
 * have equal length -- confirm against the table loader).
 */
#define SINGLES_KEEP(chars, cost)	{ chars, chars, cost, NULL }
#define SINGLES_SWAP(from, to, cost)	{ from, to, cost, NULL }

/*
 * Look-alike characters that get confused in both directions.  Each
 * string is used once as input and once as output, which yields the
 * two-way mapping (substitutions are one-way by themselves).
 */
#define SINGLES_LOOK_A	"\"``,;;_g%SSOOLIIlIC27p"
#define SINGLES_LOOK_B	"''\".:i-9X$5o0ll11[[Z?P"

const struct RawSubst substSingles[] = {
	/* Every ordinary character may stand for itself, at zero cost */
	SINGLES_KEEP(" !\"#$%&'()*+,-./0123456789:;<=>?", 0),
	SINGLES_KEEP("@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_", 0),
	SINGLES_KEEP("`abcdefghijklmnopqrstuvwxyz{|}~\f" TAB_STRING FORMFEED_STRING, 0),
#if (TAB_PAD_CHAR & 128)	/* Not already included? */
	SINGLES_KEEP(TAB_PAD_STRING, COST_LINE),
#endif
	SINGLES_KEEP("\n", COST_LINE),

	/* Common substitutions.  These costs should be fiddled */
	SINGLES_SWAP("-", "_", 1),	/* A *very* common error */
	SINGLES_SWAP("()[]", "[]{}", 5),
	SINGLES_SWAP("[]{}(){}", "()(){}[]", 10),
	SINGLES_SWAP("1l!", "|||", 10),
	SINGLES_SWAP(SINGLES_LOOK_A, SINGLES_LOOK_B, 10),
	SINGLES_SWAP(SINGLES_LOOK_B, SINGLES_LOOK_A, 10),

	/* Guessed errors, that might happen */
	SINGLES_SWAP("8B6G", "B8G6", 15),

	/* Some common insertion errors (no output string is given) */
	SINGLES_SWAP(".,'`", NULL, 10),

	/* All-NULL entry closes the table */
	{ NULL, NULL, 0, NULL }
};

#undef SINGLES_LOOK_B
#undef SINGLES_LOOK_A
#undef SINGLES_SWAP
#undef SINGLES_KEEP

/* The many-to-many substitutions */
/*
 * Entry builders for substMultiples.  Fields are, in order: input
 * string, output string, cost, and a fourth field that is either NULL
 * or TabFilter.
 */
#define MULTI_SWAP(from, to, cost)	{ from, to, cost, NULL }

/*
 * Runs of the tab pad character, assembled by string-literal
 * concatenation so that each width can be read off instead of counted.
 * The pad is hard-coded as a space; the #error below trips if
 * TAB_PAD_CHAR is ever changed to something else.
 */
#define MULTI_PAD1	" "
#define MULTI_PAD2	MULTI_PAD1 MULTI_PAD1
#define MULTI_PAD4	MULTI_PAD2 MULTI_PAD2
#define MULTI_PAD8	MULTI_PAD4 MULTI_PAD4
#define MULTI_PAD16	MULTI_PAD8 MULTI_PAD8

/*
 * A tab marker followed by one pad character, paired with the same
 * marker followed by a longer pad run.  The cost field is 0 and the
 * entry is handed to TabFilter (the file header notes that further
 * costs live inside that function).
 */
#define MULTI_TAB(pad)	{ TAB_STRING MULTI_PAD1, TAB_STRING pad, 0, TabFilter }

const struct RawSubst substMultiples[] = {
	MULTI_SWAP("''", "\"", 10),
	MULTI_SWAP("``", "\"", 10),
	MULTI_SWAP(" ", "  ", 15),
	MULTI_SWAP("NIA", "MA", 9),
	MULTI_SWAP("riM", "NM", 9),
	MULTI_SWAP("\n", " */\n", 15),

	/* Tab-stop wonders: pad runs of 2 through 20 characters */
	MULTI_TAB(MULTI_PAD2),					/*  2 */
	MULTI_TAB(MULTI_PAD2 MULTI_PAD1),			/*  3 */
	MULTI_TAB(MULTI_PAD4),					/*  4 */
	MULTI_TAB(MULTI_PAD4 MULTI_PAD1),			/*  5 */
	MULTI_TAB(MULTI_PAD4 MULTI_PAD2),			/*  6 */
	MULTI_TAB(MULTI_PAD4 MULTI_PAD2 MULTI_PAD1),		/*  7 */
	MULTI_TAB(MULTI_PAD8),					/*  8 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD1),			/*  9 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD2),			/* 10 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD2 MULTI_PAD1),		/* 11 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD4),			/* 12 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD4 MULTI_PAD1),		/* 13 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD4 MULTI_PAD2),		/* 14 */
	MULTI_TAB(MULTI_PAD8 MULTI_PAD4 MULTI_PAD2 MULTI_PAD1),	/* 15 */
	MULTI_TAB(MULTI_PAD16),					/* 16 */
	MULTI_TAB(MULTI_PAD16 MULTI_PAD1),			/* 17 */
	MULTI_TAB(MULTI_PAD16 MULTI_PAD2),			/* 18 */
	MULTI_TAB(MULTI_PAD16 MULTI_PAD2 MULTI_PAD1),		/* 19 */
	MULTI_TAB(MULTI_PAD16 MULTI_PAD4),			/* 20 */
#if TAB_PAD_CHAR != ' '
#error Fix those tab patterns!
#endif

	/* All-NULL entry closes the table */
	{ NULL, NULL, 0, NULL }
};

#undef MULTI_TAB
#undef MULTI_PAD16
#undef MULTI_PAD8
#undef MULTI_PAD4
#undef MULTI_PAD2
#undef MULTI_PAD1
#undef MULTI_SWAP
