// Legend Entertainment vocabulary file parser.
// Use it to decompress and disassemble the vocab.dat file from some Legend adventure
// games (Spellcasting 201 and 301, Eric the Unready, Gateway 1 and 2). This
// version does not work on the two earliest games (Timequest and Spellcasting 101),
// which use an uncompressed vocabulary. Those can be parsed by Volker Blasius's
// utility LEGEND1, which can be found in the utilities directory on the IF Archive site.
// Command line:
//   legend-vocab <vocabulary file> [-s201]
// Vocabulary file is always named vocab.dat. Flag -s201 is required for
// Spellcasting 201 which uses slightly different file format.
// This code is not particularly clean or reliable. Use it at your own risk.
// It may crash or hang if there are any errors in the input file.
// The main purpose is to document and analyze the file format.
//
// Written by Vasyl Tsvirkunov (vasyl at pacbell dot net) 7/2005
// 


#include <stdio.h>
#include <io.h>
#include <string.h>

#include <vector>
using namespace std;

// Short names for the integer types used throughout the file. main reads
// 2 bytes from the input file directly into each ushort variable, so the code
// relies on unsigned short being exactly 2 bytes wide.
typedef unsigned short ushort;
// NOTE(review): together with the file-scope `using namespace std;`, the
// unqualified name `byte` can become ambiguous with std::byte when the file is
// built as C++17 or later -- confirm against the target compiler/standard.
typedef unsigned char byte;

// Forward declaration; the definition follows main.
// Decodes the Huffman-compressed byte stream read from filehandle into
// destination and returns the number of bytes stored there.
//   unitcount    - number of entries in huffmantable (the root pair is at unitcount-2)
//   huffmantable - decoding table of 16-bit entries
//   total        - limit on the number of input bytes
//   uncompressed - decoding stops once this many bytes have been stored
//   destination  - output buffer supplied by the caller
ushort decomp(FILE* filehandle, ushort unitcount, ushort* huffmantable, ushort total, ushort uncompressed, byte* destination);

// One record of the first table (one record per word). main reads these
// records from the file as raw 4-byte items.
struct cWordData
{
	ushort wordOffset; // where the word's text starts in the decompressed stream
	ushort flags;      // per-word flags; printed in hex, not interpreted
};

// Record that pairs a word index (rather than a text offset) with a flags
// word; it has the same two-ushort shape as cWordData.
// Presumably describes the entries of the second, sparse table that carries
// additional flags for some words -- confirm against main.
struct cWordData2
{
	ushort wordIndex; // index of the word this record refers to
	ushort flags;     // flags value for that word
};

void main(int argc, char** argv)
{
// Call: decom-vocab vocab.dat [-s201]
//       The file is always called vocab.dat, flag s201 must be used only for Spellcasting 201.
//       This will not work for Timequest and Spellcasting 101 (non-compressed vocabularies).
	if(argc < 2)
		return;

	FILE* f = fopen(argv[1], "rb");

	bool bS201 = argc > 2 && stricmp(argv[2], "-s201") == 0;

	ushort nodecount;
	ushort huffmantable[1000];
	fread(&nodecount, 2, 1, f);
	fread(huffmantable, nodecount, 2, f);

	// For Spellcasting 201 the next two bytes is expected uncompressed stream size, for all others
	// it is compressed stream size. There is no easy way to detect this in runtime so it must be
	// specified in command line.
	ushort total;
	fread(&total, 2, 1, f);

	byte buffer[65536];
	ushort uncomptotal = bS201 ? decomp(f, nodecount, huffmantable, 0xffffu, total, buffer) :
								 decomp(f, nodecount, huffmantable, total, 0xffffu, buffer);
	buffer[uncomptotal] = 0;


	// The first table contains one entry per word, defines offset to the word in uncompressed stream and
	// some flags.
	ushort count;
	fread(&count, 2, 1, f);

	vector<cWordData> wordData;
	wordData.resize(count);
	fread(&wordData[0], count, 4, f);

	// Some words have more flags provided by sparse table.
	// The exact meaning of this count is unknown -- this is the only item besides flags that I don't know. See below.
	ushort count2;
	fread(&count2, 2, 1, f);

	// Couldn't find size of this table anywhere... It should be related to count2 but I cannot find the
	// exact nature of the relation: in all cases count3/count2 is slightly above 3 which does not make sense.
	ushort count3 = ushort((filelength(fileno(f))-ftell(f))/4);

	vector<cWordData> wordData2;
	wordData2.resize(count3);
	fread(&wordData2[0], count3, 4, f);

	fclose(f);

	for(int i=0; i<count; i++)
	{
		int where = -1;
		for(int j=0; j<count3; j++)
		{
			if(wordData2[j].wordOffset == i)
			{
				where = j; // happens only once in this table...
				break;
			}
		}

		if(where<0)
			printf("%04d: <%s> 0x%04x\n", i, buffer+wordData[i].wordOffset, wordData[i].flags);
		else
			printf("%04d: <%s> 0x%04x:0x%04x\n", i, buffer+wordData[i].wordOffset, wordData[i].flags, wordData2[where].flags);
	}

}


// Huffman decoder for the compressed word text.
//
// The tree is a table of 16-bit entries used in pairs: entry (node | bit) is the
// child taken for the next input bit, and the root pair starts at unitcount-2.
// An entry with the high bit clear is the table index of the next pair; an entry
// with the high bit set is a leaf whose symbol is the entry XOR 0xffff. Input
// bytes are consumed least significant bit first. Only symbols below 0x80 are
// stored; other symbols are decoded and dropped.
//
//   filehandle   - stream positioned at the first compressed byte
//   unitcount    - number of entries in huffmantable
//   huffmantable - the decoding table described above
//   total        - maximum number of input bytes to read
//   uncompressed - stop once this many bytes have been stored (checked after
//                  each stored byte, so one byte is stored even if this is 0)
//   destination  - output buffer; the caller must provide enough room
// Returns the number of bytes stored to destination.
//
// Decoding also stops if the stream ends early or if the table leads outside
// its own unitcount entries, instead of decoding leftover bits or reading
// past the table.
//
// The parameter types are spelled out; they are the same types as the file's
// ushort/byte aliases used by the forward declaration.
//
// NOTE(review): `total` is tested before every bit, so once the last counted
// byte has been fetched only its first bit is decoded. This is kept exactly as
// found -- confirm against the file format before changing it.
unsigned short decomp(FILE* filehandle, unsigned short unitcount, unsigned short* huffmantable,
					  unsigned short total, unsigned short uncompressed, unsigned char* destination)
{
	unsigned short stored = 0;

	// Without at least one pair there is no root to start from.
	if(unitcount < 2)
		return stored;

	const unsigned short root = static_cast<unsigned short>(unitcount - 2);
	unsigned short node = root;
	unsigned char bits = 0;     // unread bits of the current input byte
	unsigned char bitsleft = 0; // how many of them remain

	while(total)
	{
		if(!bitsleft)
		{
			if(fread(&bits, 1, 1, filehandle) != 1)
				break; // input ended early
			total--;
			bitsleft = 8;
		}

		const unsigned index = (bits & 1u) | node; // note: word table
		if(index >= unitcount)
			break; // corrupt table

		node = huffmantable[index];
		bits >>= 1;
		bitsleft--;

		if((node & 0x8000) == 0) // internal node: keep walking with the next bit
			continue;

		const unsigned short symbol = static_cast<unsigned short>(node ^ 0xffff); // leaf
		node = root;
		if(symbol < 0x80)
		{
			*destination++ = static_cast<unsigned char>(symbol);
			stored++;
			if(stored >= uncompressed)
				break;
		}
	}

	return stored;
}
