// Legend Entertainment string file parser.
// Use it to decompress and disassemble the string file of any Legend adventure
// game from Timequest to Callahan's Crosstime Saloon.
// Command line:
//   legend-str <string file> [-version]
// String file always has extension .dat and the name ends in "str." For Eric
// the Unready it is ERICSTR.DAT, for Companions of Xanth it is XANTHSTR.DAT, etc.
// There are four different versions of the file format. The utility cannot
// determine the exact version on its own, so it has to be specified on
// the command line. If no number is specified, version 2 is assumed.
// Version 1 was used by the two oldest games -- Spellcasting 101 and Timequest.
// Version 2 was used by the other two Spellcasting games, Eric the Unready, and
// the first Gateway game.
// Version 3 was used first in Gateway 2 and then continued through the rest
// of the games except CCS which is the only existing game to use version 4.
// I've tried multiple versions of the same games and did not find any
// discrepancies but that does not mean that none existed. If you find any games
// that did not match listed version numbers, let me know.
// This code is not particularly clean or reliable. Use it at your own risk.
// It may crash or hang if the wrong version is specified or in case of any other
// error.
// The main purpose is to document and analyze the file format.
//
// Written by Vasyl Tsvirkunov (vasyl at pacbell dot net) 7/2005
// 


#include <stdio.h>
#include <string.h>
#include <io.h>

#include <vector>
using namespace std;

// Unsigned word type. main() reads counts, sizes and table entries from the
// file directly into variables of this type as 2-byte values, so it is assumed
// to be exactly 16 bits wide.
typedef unsigned short ushort;
// Single byte of compressed input or decoded output.
// NOTE(review): together with `using namespace std;` this name may clash with
// std::byte when the file is compiled as C++17 or later -- confirm on the
// target compiler.
typedef unsigned char byte;

// String file format version. Defaults to 2; main() changes it when the second
// command line argument is -1, -3 or -4, and decomp() branches on it.
ushort version = 2;

// Oldest header record format: S101 and TQ.
// One record per segment. main() fills it with a raw 6-byte fread, so the
// member order and sizes must match the file layout exactly.
struct cHeaderV1
{
	ushort uncStreamSize;  // presumably the uncompressed stream size (going by the name); not used by this utility
	ushort compStreamSize; // presumably the compressed stream size (going by the name); not used by this utility
	ushort stringCount;    // number of strings in the segment; the only field main() uses
};

// Most common header record format: S201, S301, Eric, GW.
// One record per segment. main() fills it with a raw 4-byte fread, so the
// member order and sizes must match the file layout exactly.
struct cHeaderV2
{
	ushort stringCount; // number of strings in the segment; the only field main() uses
	ushort streamSize;  // presumably the size of the segment's stream (going by the name); not used by this utility
};

// Newest header format: GW2, Xanth, Hoboken, Shannara, Mission Critical, Callahan's Crosstime Saloon, Quandaries
// Version 4 is used only by CCS.
// One record per segment. main() fills it with a raw 6-byte fread, so the
// member order and sizes must match the file layout exactly.
struct cHeaderV34
{
	ushort stringCount; // number of strings in the segment; the only field main() uses
	ushort streamSize;  // presumably the size of the segment's stream (going by the name); not used by this utility
	ushort flags; // ? (meaning unknown; not used by this utility)
};


// Huffman tree as stored in the file: a flat table of 16-bit entries that
// decomp() walks in pairs.
typedef vector<ushort> tHuffmanTree;
// Raw byte stream. Not referenced anywhere else in the visible code.
typedef vector<byte> tStream;


// Per-segment table with one 16-bit size entry per string; main() passes each
// entry to decomp() as the number of compressed bytes to consume.
typedef vector<ushort> tSizeTable;

// Decodes one string from the current file position into destination and
// returns the number of bytes written. Defined below main().
ushort decomp(FILE* filehandle, ushort unitcount, ushort* huffmantable, ushort total, byte* destination, ushort* dictionaryOffsets, byte* dictionary);

// Not referenced anywhere else in the visible code.
bool bOld = false;

void main(int argc, char** argv)
{
	if(argc < 2)
		return;

	if(argc > 2)
	{
		if(strcmp(argv[2], "-3") == 0)
			version = 3;
		else if(strcmp(argv[2], "-1") == 0)
			version = 1;
		else if(strcmp(argv[2], "-4") == 0)
			version = 4;
	}

	ushort s;
	FILE* f = fopen(argv[1], "rb");

	// Read header. It starts from "segment" count, then contains appropriate number
	// of entries describing each segment.
	ushort segmentCount;
	fread(&segmentCount, 2, 1, f);

	vector<ushort> stringCounts;
	stringCounts.resize(segmentCount);
	for(s=0; s<segmentCount; s++)
	{
		switch(version)
		{
		case 1:
			{
				cHeaderV1 h;
				fread(&h, 6, 1, f);
				stringCounts[s] = h.stringCount;
			}
			break;

		case 2:
			{
				cHeaderV2 h;
				fread(&h, 4, 1, f);
				stringCounts[s] = h.stringCount;
			}
			break;

		case 3:
		case 4:
			{
				cHeaderV34 h;
				fread(&h, 6, 1, f);
				stringCounts[s] = h.stringCount;
			}
			break;
		}
	}

	// Starting with version 2 there are tables of string sizes, one table per segment, one entry per string.
	// S101 and TQ don't have this table, strings are terminated implicitly.
	vector<tSizeTable> compressedTextSizeTables;
	compressedTextSizeTables.resize(segmentCount);

	for(s=0; s<segmentCount; s++)
	{
		compressedTextSizeTables[s].resize(stringCounts[s]);
		if(version == 1)
			memset(&compressedTextSizeTables[s][0], 0xff, stringCounts[s]*2);
		else
			fread(&compressedTextSizeTables[s][0], stringCounts[s], 2, f);
	}

	// Huffman tree.
	// Encoded as table of entry pairs. First two bytes is number of entries (version>=2) or pairs (version==1)
	ushort huffmantablesize;
	fread(&huffmantablesize, 2, 1, f);
	if(version == 1)
		huffmantablesize *= 2;
	tHuffmanTree huffmantable;
	huffmantable.resize(huffmantablesize);
	fread(&huffmantable[0], huffmantablesize, 2, f);

	// "Shorthand" table.
	ushort offsetsInShorthandsSize;
	fread(&offsetsInShorthandsSize, 2, 1, f);
	vector<ushort> offsetsInShorthands;
	offsetsInShorthands.resize(offsetsInShorthandsSize);
	fread(&offsetsInShorthands[0], offsetsInShorthandsSize, 2, f);

	ushort shorthandsize;
	fread(&shorthandsize, 2, 1, f);
	vector<byte> shorthand;
	shorthand.resize(shorthandsize);
	fread(&shorthand[0], shorthandsize, 1, f);

	// Encoded strings
	char buffer[65536];
	for(ushort i=0; i<segmentCount; i++)
	{
		for(ushort s=0; s<stringCounts[i]; s++)
		{
			ushort dsize = decomp(f, huffmantablesize, &huffmantable[0], compressedTextSizeTables[i][s], (byte*)buffer, &offsetsInShorthands[0], &shorthand[0]);
			buffer[dsize] = 0;

			printf("%02d:%03d: <%s>\n", i, s, buffer);
		}
	}
}




// Decodes one Huffman-compressed string, reading from the current position of
// filehandle.
//
//   filehandle        - open string file positioned at the first compressed byte
//   unitcount         - number of 16-bit entries in huffmantable (as passed by
//                       main()); the root pair is the last pair, except in
//                       version 1 where it is the first one
//   huffmantable      - tree stored as pairs of entries; the next input bit selects
//                       the entry within the pair. An entry with bit 15 set is
//                       a leaf holding the inverted symbol code, any other
//                       entry is the index of the next pair (a pair index in
//                       version 1, an entry index otherwise)
//   total             - number of compressed bytes that may be consumed
//   destination       - output buffer; its size is not known here and no
//                       terminator is written
//   dictionaryOffsets - offset of each shorthand inside dictionary
//   dictionary        - zero-terminated shorthand strings
//
// Returns the number of bytes stored in destination (16-bit counter).
// Decoding stops when total is exhausted, when version 1 data references the
// root, when the file cannot supply another byte, or when the tree points
// outside the table.
ushort decomp(FILE* filehandle, ushort unitcount, ushort* huffmantable, ushort total, byte* destination, ushort* dictionaryOffsets, byte* dictionary)
{
	ushort uncompsize = 0;
	byte bitcounter = 0; // bits of `buffer` not consumed yet
	byte buffer = 0;     // current input byte, consumed from the lowest bit up
	bool escape = false; // used only in version 4

	while(total)
	{
		// In version 1 table root is the first element, in all other versions it is the last element (pair)
		ushort treecode;
		if(version == 1)
			treecode = 0;
		else
			treecode = unitcount-2;

		// Follow one bit per step until a leaf is reached.
		for(;;)
		{
			// `total` drops to zero as soon as the last byte has been fetched, so
			// after that byte only the bits up to the next node are used. This
			// matches the original decoder and is kept as is.
			if(!total)
				return uncompsize;

			if(!bitcounter)
			{
				// Do not decode stale data when the file ends early.
				if(fread(&buffer, 1, 1, filehandle) != 1)
					return uncompsize;
				total--;

				bitcounter = 8;
			}

			const unsigned index = (buffer&1)|treecode; // note: word table
			// Malformed tree or wrong version: stay inside the table.
			if(index >= unitcount)
				return uncompsize;
			treecode = huffmantable[index];

			buffer >>= 1;
			bitcounter --;

			// String termination by reference to root in S101 and TQ only
			if(version == 1 && treecode == 0)
				return uncompsize;

			if(treecode & 0x8000) // leaf
				break;

			// S101 and TQ format is slightly different. The number is pair index, not entry index.
			if(version == 1)
				treecode *= 2;
		}

		treecode = treecode^0xffff; // leaf value is stored inverted
		// Here is another oddity from S101/TQ. This also means that the first entry in shorthand table
		// cannot be used.
		if(version == 1)
			treecode ++;

		if((version==4 && escape) || treecode >= 0x80) // shorthand?
		{
			if(version < 4)
				treecode -= 0x80; // older version used the second half for shorthands
			else
				escape = false;

			// Copy the shorthand text up to, but not including, its terminator.
			for(const byte* runptr = &dictionary[dictionaryOffsets[treecode]]; *runptr; runptr++)
			{
				*destination++ = *runptr;
				uncompsize ++;
			}
		}
		else if(version == 4 && treecode == 0x1a)
		{
			// Version 4 escape code: the next symbol is a shorthand index.
			escape = true;
		}
		else
		{
			*destination++ = byte(treecode);
			uncompsize++;
		}
	}

	return uncompsize;
}
