#include "VS_2_0Assembler.hpp"

#include "Instruction.hpp"
#include "Error.hpp"
#include "Texture.hpp"
#include "VertexBuffer.hpp"
#include "XVertex.hpp"

#include <float.h>

namespace swShader
{
	using namespace SoftWire;

	// Static register storage for the shader. reference() maps each Operand
	// register type onto one of these objects.
	float4 VS_2_0Assembler::v[16];   // INPUT_REGISTER
	float4 VS_2_0Assembler::r[16];   // TEMPORARY_REGISTER
//	float4 VS_2_0Assembler::c[256];   // Defined in VertexShader
//	int4 VS_2_0Assembler::i[16];      // Defined in VertexShader
//	bool VS_2_0Assembler::b[16];      // Defined in VertexShader
	int VS_2_0Assembler::a0;         // ADDRESS_REGISTER
	int VS_2_0Assembler::aL;         // LOOP_COUNTER_REGISTER
	float4 VS_2_0Assembler::oD[2];   // DIFFUSE_SPECULAR_REGISTER
	float VS_2_0Assembler::oFog;     // FOG_REGISTER
	float4 VS_2_0Assembler::oPos;    // POSITION_REGISTER
	float VS_2_0Assembler::oPts;     // POINT_SIZE_REGISTER
	float4 VS_2_0Assembler::oT[8];   // TEXTURE_COORDINATE_REGISTER

	// Operand handles for the eight internal scratch registers; they are
	// released after every instruction by freeTemps()
	Operand VS_2_0Assembler::tmp0(Operand::INTERNAL_REGISTER, 0);
	Operand VS_2_0Assembler::tmp1(Operand::INTERNAL_REGISTER, 1);
	Operand VS_2_0Assembler::tmp2(Operand::INTERNAL_REGISTER, 2);
	Operand VS_2_0Assembler::tmp3(Operand::INTERNAL_REGISTER, 3);
	Operand VS_2_0Assembler::tmp4(Operand::INTERNAL_REGISTER, 4);
	Operand VS_2_0Assembler::tmp5(Operand::INTERNAL_REGISTER, 5);
	Operand VS_2_0Assembler::tmp6(Operand::INTERNAL_REGISTER, 6);
	Operand VS_2_0Assembler::tmp7(Operand::INTERNAL_REGISTER, 7);

	// Backing storage that reference() returns for INTERNAL_REGISTER operands
	float4 VS_2_0Assembler::tmp[8];

	// Set by process() to 4 * vertex index; readInput() uses it to address streams
	int VS_2_0Assembler::n;

	// Creates an assembler with one empty instruction, no stream sources,
	// no output vertex and no generated code.
	VS_2_0Assembler::VS_2_0Assembler()
	{
		// Head of the intermediate instruction list; the set*() calls fill it in
		intermediate = new Instruction();
		instruction = intermediate;

		// No stream is bound yet; the DCL_* handlers reject null streams
		position[0] = 0;
		position[1] = 0;
		blendWeight = 0;
		blendIndices = 0;
		normal[0] = 0;
		normal[1] = 0;
		pSize = 0;

		for(int stage = 0; stage < 8; stage++)
		{
			texCoord[stage] = 0;
		}

		tangent = 0;
		binormal = 0;
		tessFactor = 0;
		positiont = 0;
		color[0] = 0;
		color[1] = 0;
		fog = 0;
		depth = 0;
		sample = 0;

		// Previously left uninitialized; the code emitted by writeOutput()
		// reads this pointer, so give it a defined value until
		// setOutputVertex() is called
		output = 0;

		// Machine code is generated lazily by encode()
		code = 0;

		n = 0;
		oFVF = 0;
	}

	// Deletes the head instruction allocated by the constructor and clears
	// the pointer to it.
	VS_2_0Assembler::~VS_2_0Assembler()
	{
		Instruction *const head = intermediate;

		intermediate = 0;
		delete head;
	}

	// Runs the generated shader code for vertex i, generating the code first
	// if that has not happened yet. Throws INTERNAL_ERROR when no code could
	// be produced.
	void VS_2_0Assembler::process(int i)
	{
		if(!code)
		{
			encode();

			if(!code)
			{
				throw INTERNAL_ERROR;
			}
		}

		// readInput() emits stream addressing relative to n
		n = 4 * i;

		code();
	}

	void VS_2_0Assembler::loadConstants()
	{
		if(!intermediate) throw INTERNAL_ERROR;
		Instruction *instruction = intermediate;

		while(instruction)
		{
			Operand &dest = instruction->destination;
			const Operand &fValue1 = instruction->source0;
			const Operand &fValue2 = instruction->source1;
			const Operand &fValue3 = instruction->source2;
			const Operand &fValue4 = instruction->source3;
			const Operand &integerValue1 = instruction->source0;
			const Operand &integerValue2 = instruction->source1;
			const Operand &integerValue3 = instruction->source2;
			const Operand &integerValue4 = instruction->source3;
			const Operand &booleanValue = instruction->source0;

			switch(instruction->mnemonic)
			{
			case Instruction::DEF:	DEF(dest, fValue1, fValue2, fValue3, fValue4);	break;
			case Instruction::DEFI:	DEFI(dest, integerValue1, integerValue2, integerValue3, integerValue4);	break;
			case Instruction::DEFB:	DEFB(dest, booleanValue);						break;
			}

			instruction = instruction->next;
		}
	}

	void VS_2_0Assembler::setPositionStream(const float4 *position, int usageIndex)
	{
		this->position[usageIndex] = position;
	}

	// Binds the blend-weight stream that DCL_BLENDWEIGHT reads from.
	void VS_2_0Assembler::setBlendWeightStream(const float4 *blendWeight)
	{
		VS_2_0Assembler::blendWeight = blendWeight;
	}

	// Binds the blend-indices stream that DCL_BLENDINDICES reads from.
	void VS_2_0Assembler::setBlendIndicesStream(const byte4 *blendIndices)
	{
		VS_2_0Assembler::blendIndices = blendIndices;
	}

	void VS_2_0Assembler::setNormalStream(const float4 *normal, int usageIndex)
	{
		this->normal[usageIndex] = normal;
	}

	// Binds the point-size stream that DCL_PSIZE reads from.
	void VS_2_0Assembler::setPSizeStream(const float *pSize)
	{
		VS_2_0Assembler::pSize = pSize;
	}

	void VS_2_0Assembler::setTexCoordStream(const float4 *texCoord, int usageIndex)
	{
		this->texCoord[usageIndex] = texCoord;
	}

	// Binds the tangent stream that DCL_TANGENT reads from.
	void VS_2_0Assembler::setTangentStream(const float4 *tangent)
	{
		VS_2_0Assembler::tangent = tangent;
	}

	// Binds the binormal stream that DCL_BINORMAL reads from.
	void VS_2_0Assembler::setBinormalStream(const float4 *binormal)
	{
		VS_2_0Assembler::binormal = binormal;
	}

	// Binds the tessellation-factor stream that DCL_TESSFACTOR reads from.
	void VS_2_0Assembler::setTessFactorStream(const float *tessFactor)
	{
		VS_2_0Assembler::tessFactor = tessFactor;
	}

	// Binds the positiont stream that DCL_POSITIONT reads from.
	void VS_2_0Assembler::setPositiontStream(const float4 *positiont)
	{
		VS_2_0Assembler::positiont = positiont;
	}

	void VS_2_0Assembler::setColorStream(const Color<byte> *color, int usageIndex)
	{
		this->color[usageIndex] = color;
	}

	// Binds the fog stream that DCL_FOG reads from.
	void VS_2_0Assembler::setFogStream(const float *fog)
	{
		VS_2_0Assembler::fog = fog;
	}

	// Binds the depth stream that DCL_DEPTH reads from.
	void VS_2_0Assembler::setDepthStream(const float *depth)
	{
		VS_2_0Assembler::depth = depth;
	}

	// Binds the sample stream that DCL_SAMPLE reads from.
	void VS_2_0Assembler::setSampleStream(const float4 *sample)
	{
		VS_2_0Assembler::sample = sample;
	}

	// Sets the vertex that the code emitted by writeOutput() stores into.
	void VS_2_0Assembler::setOutputVertex(XVertex *output)
	{
		VS_2_0Assembler::output = output;
	}

	// Returns the output vertex format flags, generating the shader code
	// first if that has not happened yet. Throws INTERNAL_ERROR when no code
	// could be produced.
	FVFFlags VS_2_0Assembler::getOutputFormat()
	{
		if(!code)
		{
			encode();

			if(!code)
			{
				throw INTERNAL_ERROR;
			}
		}

		return oFVF;
	}

	// Stores the mnemonic in the instruction currently being built.
	void VS_2_0Assembler::setMnemonic(Instruction::Mnemonic mnemonic)
	{
		Instruction &current = *instruction;

		current.mnemonic = mnemonic;
	}

	// Stores the modifier in the instruction currently being built.
	void VS_2_0Assembler::setModifier(Instruction::Modifier modifier)
	{
		Instruction &current = *instruction;

		current.modifier = modifier;
	}

	void VS_2_0Assembler::setDestination(const Operand &operand)
	{
		instruction->destination = operand;
	}

	void VS_2_0Assembler::setSource0(const Operand &operand)
	{
		instruction->source0 = operand;
	}

	void VS_2_0Assembler::setSource1(const Operand &operand)
	{
		instruction->source1 = operand;
	}

	void VS_2_0Assembler::setSource2(const Operand &operand)
	{
		instruction->source2 = operand;
	}

	void VS_2_0Assembler::setSource3(const Operand &operand)
	{
		instruction->source3 = operand;
	}

	void VS_2_0Assembler::newInstruction()
	{
		instruction = instruction->newNext();
	}

	void VS_2_0Assembler::encode()
	{
		#ifndef NDEBUG
			setEchoFile("VS_2_0Shader.asm");
		#endif

		for(int i = 0; i < 16; i++) vDcl[i] = false;

		try
		{
			pushad();
			freeAll();

			shader();

			writeOutput();

			emms();
			popad();
			ret();
		}
		catch(const Error &error)
		{
			throw Error("Fatal vertex shader assembler error: ") << error;
		}
		catch(...)
		{
			throw INTERNAL_ERROR;
		}

		code = finalize();
	}

	// Emits the code that loads input register v[i] from the stream declared
	// for it. The stream pointer is fetched through input[i].buffer and the
	// element is addressed with n (set by process() to 4 * vertex index).
	// Float streams with fewer than four components are completed with
	// y, z = 0 and w = 1.
	// Throws INTERNAL_ERROR when no stream is attached or the type is unknown.
	void VS_2_0Assembler::readInput(int i)
	{
		annotate("readInput(%d)", i);

		if(!input[i].buffer) throw INTERNAL_ERROR;

		static dword2 unpackl;
		static dword2 unpackh;

		static const int4 maskw = {-1, -1, -1, 0};
		static const int4 expand = {0, 0, 0, 0x3F800000};   // 1.0f

		static int source;

		switch(input[i].type)
		{
		case Stream::TYPE_INVALID:
			break;
		case Stream::TYPE_FLOAT1:
			mov(x32(&source), dword_ptr [input[i].buffer]);
			// A movss load from memory clears y, z and w, so w = 1 has to be
			// merged in afterwards (loading 'expand' first would be overwritten)
			movss(x128(&v[i]), dword_ptr [r32(&source)+r32(&n)]);
			// Expand with y, z, w = 0, 0, 1
			orps(r128(&v[i]), xword_ptr [&expand]);
			break;
		case Stream::TYPE_FLOAT2:
			mov(x32(&source), dword_ptr [input[i].buffer]);
			movaps(x128(&v[i]), xword_ptr [r32(&source)+4*r32(&n)]);
			// Expand with z, w = 0, 1
			// The offset is in bytes: '&expand+8' would step over eight whole
			// int4 objects instead of addressing the upper half of 'expand'
			movhps(x128(&v[i]), mmword_ptr [(const char*)&expand+8]);
			break;
		case Stream::TYPE_FLOAT3:
			mov(x32(&source), dword_ptr [input[i].buffer]);
			movaps(x128(&v[i]), xword_ptr [r32(&source)+4*r32(&n)]);
			// Expand with w = 1
			andps(r128(&v[i]), xword_ptr[&maskw]);
			orps(r128(&v[i]), xword_ptr[&expand]);
			break;
		case Stream::TYPE_FLOAT4:
			mov(x32(&source), dword_ptr [input[i].buffer]);
			movaps(x128(&v[i]), xword_ptr [r32(&source)+4*r32(&n)]);
			break;
		case Stream::TYPE_UBYTE4:
		case Stream::TYPE_COLOR:
			// NOTE(review): unpackl/unpackh are allocated with x64() and not
			// cleared before the unpacks, and no shift follows, so the bytes
			// interleaved below the stream data look undefined - confirm
			mov(x32(&source), dword_ptr [input[i].buffer]);
			punpcklbw(x64(&unpackl), qword_ptr [r32(&source)+r32(&n)]);
			punpckhwd(x64(&unpackh), m64(&unpackl));
			punpcklwd(x64(&unpackl), m64(&unpackl));
			cvtpi2ps(x128(&v[i]), m64(&unpackh));		free(&unpackh);
			movlhps(x128(&v[i]), r128(&v[i]));
			cvtpi2ps(r128(&v[i]), m64(&unpackl));		free(&unpackl);
			break;
		default:
			throw INTERNAL_ERROR;
		}
	}

	void VS_2_0Assembler::shader()
	{
		annotate("shader()");

		if(!intermediate) throw INTERNAL_ERROR;
		Instruction *instruction = intermediate;

		if(instruction->mnemonic != Instruction::VS_2_0)throw Error("First shader instruction should be VS_2_0");

		while(instruction)
		{
			Operand &dst = instruction->destination;
			Operand &dest = instruction->destination;
			const Operand &src = instruction->source0;
			const Operand &src0 = instruction->source0;
			const Operand &src1 = instruction->source1;
			const Operand &src2 = instruction->source2;
			const Operand &src3 = instruction->source3;
			const Operand &fValue1 = instruction->source0;
			const Operand &fValue2 = instruction->source1;
			const Operand &fValue3 = instruction->source2;
			const Operand &fValue4 = instruction->source3;
			const Operand &integerValue1 = instruction->source0;
			const Operand &integerValue2 = instruction->source1;
			const Operand &integerValue3 = instruction->source2;
			const Operand &integerValue4 = instruction->source3;
			const Operand &booleanValue = instruction->source0;
			const Operand &boolRegister = instruction->source0;
			const Operand aL = instruction->destination;
			const Operand integerRegister = instruction->source0;
			const Operand &label = instruction->destination;

			switch(instruction->mnemonic)
			{
			case Instruction::VS_2_0:			VS_2_0();										break;

			case Instruction::DCL_POSITION0:	DCL_POSITION0(dest);							break;
			case Instruction::DCL_POSITION1:	DCL_POSITION1(dest);							break;
			case Instruction::DCL_BLENDWEIGHT:	DCL_BLENDWEIGHT(dest);							break;
			case Instruction::DCL_BLENDINDICES:	DCL_BLENDINDICES(dest);							break;
			case Instruction::DCL_NORMAL0:		DCL_NORMAL0(dest);								break;
			case Instruction::DCL_NORMAL1:		DCL_NORMAL1(dest);								break;
			case Instruction::DCL_PSIZE:		DCL_PSIZE(dest);								break;
			case Instruction::DCL_TEXCOORD0:	DCL_TEXCOORD0(dest);							break;
			case Instruction::DCL_TEXCOORD1:	DCL_TEXCOORD1(dest);							break;
			case Instruction::DCL_TEXCOORD2:	DCL_TEXCOORD2(dest);							break;
			case Instruction::DCL_TEXCOORD3:	DCL_TEXCOORD3(dest);							break;
			case Instruction::DCL_TEXCOORD4:	DCL_TEXCOORD4(dest);							break;
			case Instruction::DCL_TEXCOORD5:	DCL_TEXCOORD5(dest);							break;
			case Instruction::DCL_TEXCOORD6:	DCL_TEXCOORD6(dest);							break;
			case Instruction::DCL_TEXCOORD7:	DCL_TEXCOORD7(dest);							break;
			case Instruction::DCL_TANGENT:		DCL_TANGENT(dest);								break;
			case Instruction::DCL_BINORMAL:		DCL_BINORMAL(dest);								break;
			case Instruction::DCL_TESSFACTOR:	DCL_TESSFACTOR(dest);							break;
			case Instruction::DCL_POSITIONT:	DCL_POSITIONT(dest);							break;
			case Instruction::DCL_COLOR0:		DCL_COLOR0(dest);								break;
			case Instruction::DCL_COLOR1:		DCL_COLOR1(dest);								break;
			case Instruction::DCL_FOG:			DCL_FOG(dest);									break;
			case Instruction::DCL_DEPTH:		DCL_DEPTH(dest);								break;
			case Instruction::DCL_SAMPLE:		DCL_SAMPLE(dest);								break;
			case Instruction::DEF:				DEF(dest, fValue1, fValue2, fValue3, fValue4);	break;
			case Instruction::DEFI:				DEFI(dest, integerValue1, integerValue2, integerValue3, integerValue4);	break;
			case Instruction::DEFB:				DEFB(dest, booleanValue);						break;

			case Instruction::ABS:				ABS(dst, src);									break;
			case Instruction::ADD:				ADD(dst, src0, src1);							break;
			case Instruction::CRS:				CRS(dst, src0, src1);							break;
			case Instruction::DP3:				DP3(dst, src0, src1);							break;
			case Instruction::DP4:				DP4(dst, src0, src1);							break;
			case Instruction::EXP:				EXP(dst, src);									break;
			case Instruction::EXPP:				EXPP(dst, src);									break;
			case Instruction::FRC:				FRC(dst, src);									break;
			case Instruction::LIT:				LIT(dst, src);									break;
			case Instruction::LOG:				LOG(dst, src);									break;
			case Instruction::LOGP:				LOGP(dst, src);									break;
			case Instruction::LRP:				LRP(dst, src0, src1, src2);						break;
			case Instruction::M3X2:				M3X2(dst, src0, src1);							break;
			case Instruction::M3X3:				M3X3(dst, src0, src1);							break;
			case Instruction::M3X4:				M3X4(dst, src0, src1);							break;
			case Instruction::M4X3:				M4X3(dst, src0, src1);							break;
			case Instruction::M4X4:				M4X4(dst, src0, src1);							break;
			case Instruction::MAD:				MAD(dst, src0, src1, src2);						break;
			case Instruction::MAX:				MAX(dst, src0, src1);							break;
			case Instruction::MIN:				MIN(dst, src0, src1);							break;
			case Instruction::MOV:				MOV(dst, src);									break;
			case Instruction::MOVA:				MOVA(dst, src);									break;
			case Instruction::MUL:				MUL(dst, src0, src1);							break;
			case Instruction::NOP:				NOP();											break;
			case Instruction::NRM:				NRM(dst, src);									break;
			case Instruction::POW:				POW(dst, src0, src1);							break;
			case Instruction::RCP:				RCP(dst, src);									break;
			case Instruction::RSQ:				RSQ(dst, src);									break;
			case Instruction::SGE:				SGE(dst, src0, src1);							break;
			case Instruction::SGN:				SGN(dst, src0, src1, src2);						break;
			case Instruction::SINCOS:			SINCOS(dst, src0, src1, src2);					break;
			case Instruction::SUB:				SUB(dst, src0, src1);							break;

			case Instruction::CALL:				CALL(label);									break;
			case Instruction::CALLNZ:			CALLNZ(label, boolRegister);					break;
			case Instruction::ELSE:				ELSE();											break;
			case Instruction::ENDIF:			ENDIF();										break;
			case Instruction::ENDLOOP:			ENDLOOP();										break;
			case Instruction::ENDREP:			ENDREP();										break;
			case Instruction::IF:				IF(boolRegister);								break;
			case Instruction::LABEL:			LABEL(label);									break;
			case Instruction::LOOP:				LOOP(aL, integerRegister);						break;
			case Instruction::REP:				REP(integerRegister);							break;
			case Instruction::RET:				RET();											break;

			case Instruction::INVALID:															break;
			default:							throw INTERNAL_ERROR;
			}

			freeTemps();

			annotate("\n");

			instruction = instruction->next;
		}
	}

	void VS_2_0Assembler::writeOutput()
	{
		annotate("writeOutput()");

		const int position0Offset	= (int)&((XVertex*)0)->P;
	//	const int normal0Offset		= (int)&((XVertex*)0)->N;
		const int color0Offset		= (int)&((XVertex*)0)->C;
		const int color1Offset		= (int)&((XVertex*)0)->L;
		const int texcoord0Offset	= (int)&((XVertex*)0)->T[0];
		const int texcoord1Offset	= (int)&((XVertex*)0)->T[1];
		const int texcoord2Offset	= (int)&((XVertex*)0)->T[2];
		const int texcoord3Offset	= (int)&((XVertex*)0)->T[3];
		const int texcoord4Offset	= (int)&((XVertex*)0)->T[4];
		const int texcoord5Offset	= (int)&((XVertex*)0)->T[5];
		const int texcoord6Offset	= (int)&((XVertex*)0)->T[6];
		const int texcoord7Offset	= (int)&((XVertex*)0)->T[7];

								movups(xword_ptr [r32(&output)+position0Offset], r128(&oPos[0]));
		if(oFVF.hasDiffuse())	movups(xword_ptr [r32(&output)+color0Offset], r128(&oD[0]));
		if(oFVF.hasSpecular())	movups(xword_ptr [r32(&output)+color1Offset], r128(&oD[1]));
		if(oFVF.hasTexture(0))	movlps(qword_ptr [r32(&output)+texcoord0Offset], r128(&oT[0]));
		if(oFVF.hasTexture(1))	movlps(qword_ptr [r32(&output)+texcoord1Offset], r128(&oT[1]));
		if(oFVF.hasTexture(2))	movlps(qword_ptr [r32(&output)+texcoord2Offset], r128(&oT[2]));
		if(oFVF.hasTexture(3))	movlps(qword_ptr [r32(&output)+texcoord3Offset], r128(&oT[3]));
		if(oFVF.hasTexture(4))	movlps(qword_ptr [r32(&output)+texcoord4Offset], r128(&oT[4]));
		if(oFVF.hasTexture(5))	movlps(qword_ptr [r32(&output)+texcoord5Offset], r128(&oT[5]));
		if(oFVF.hasTexture(6))	movlps(qword_ptr [r32(&output)+texcoord6Offset], r128(&oT[6]));
		if(oFVF.hasTexture(7))	movlps(qword_ptr [r32(&output)+texcoord7Offset], r128(&oT[7]));
	}

	void *VS_2_0Assembler::reference(const Operand &reg)
	{
		switch(reg.type)
		{
		case Operand::ADDRESS_REGISTER:
			return &a0;
		case Operand::INPUT_REGISTER:
			return &v[reg.index];
		case Operand::CONSTANT_FLOAT_REGISTER:
			return &c[reg.index];
		case Operand::CONSTANT_INTEGER_REGISTER:
			return &i[reg.index];
		case Operand::CONSTANT_BOOLEAN_REGISTER:
			return &b[reg.index];
		case Operand::TEXTURE_COORDINATE_REGISTER:
			return &oT[reg.index];
		case Operand::LOOP_COUNTER_REGISTER:
			return &aL;
		case Operand::TEMPORARY_REGISTER:
			return &r[reg.index];
		case Operand::DIFFUSE_SPECULAR_REGISTER:
			return &oD[reg.index];
		case Operand::POSITION_REGISTER:
			return &oPos;
		case Operand::POINT_SIZE_REGISTER:
			return &oPts;
		case Operand::FOG_REGISTER:
			return &oFog;
	//	case Operand::CONSTANT_FLOAT_LITERAL:
	//		return 0;
		case Operand::INTERNAL_REGISTER:
			return &tmp[reg.index];
		default:
			throw INTERNAL_ERROR;
		}
	}

	// Forwards to CodeGenerator::r128() for the storage of shader register
	// `reg` offset by `next` registers. The declaration check is done on
	// `reg` itself, before the offset is applied.
	const OperandXMMREG VS_2_0Assembler::r128(const Operand &reg, int next)
	{
		checkDcl(reg);

		Operand shifted(reg);
		shifted.index = shifted.index + next;

		return CodeGenerator::r128(reference(shifted));
	}

	// Forwards to CodeGenerator::x128() for the storage of shader register
	// `reg` offset by `next` registers. The declaration check is done on
	// `reg` itself, before the offset is applied.
	const OperandXMMREG VS_2_0Assembler::x128(const Operand &reg, int next)
	{
		checkDcl(reg);

		Operand shifted(reg);
		shifted.index = shifted.index + next;

		return CodeGenerator::x128(reference(shifted));
	}

	// Forwards to CodeGenerator::m128() for the storage of shader register
	// `r_m` offset by `next` registers. The declaration check is done on
	// `r_m` itself, before the offset is applied.
	const OperandR_M128 VS_2_0Assembler::m128(const Operand &r_m, int next)
	{
		checkDcl(r_m);

		Operand shifted(r_m);
		shifted.index = shifted.index + next;

		return CodeGenerator::m128(reference(shifted));
	}

	// Same operand as m128(r_m, next), reinterpreted as a scalar
	// (32-bit) XMM operand.
	const OperandXMM32 VS_2_0Assembler::xmm32(const Operand &r_m, int next)
	{
		const OperandR_M128 packed = m128(r_m, next);

		return (const OperandXMM32&)packed;
	}

	// Plain forwarder to CodeGenerator::r128() for raw memory references.
	const OperandXMMREG VS_2_0Assembler::r128(const OperandREF &ref)
	{
		return this->CodeGenerator::r128(ref);
	}

	// Plain forwarder to CodeGenerator::x128() for raw memory references.
	const OperandXMMREG VS_2_0Assembler::x128(const OperandREF &ref)
	{
		return this->CodeGenerator::x128(ref);
	}

	// Plain forwarder to CodeGenerator::m128() for raw memory references.
	const OperandR_M128 VS_2_0Assembler::m128(const OperandREF &ref)
	{
		return this->CodeGenerator::m128(ref);
	}

	// Same operand as CodeGenerator::m128(ref), reinterpreted as a scalar
	// (32-bit) XMM operand.
	const OperandXMM32 VS_2_0Assembler::xmm32(const OperandREF &ref)
	{
		const OperandR_M128 packed = CodeGenerator::m128(ref);

		return (const OperandXMM32&)packed;
	}

	void VS_2_0Assembler::free(const OperandREF &ref)
	{
		CodeGenerator::free(ref);
	}

	void VS_2_0Assembler::checkDcl(const Operand &op)
	{
		if(op.type == Operand::INPUT_REGISTER)
		{
			if(op.index < 0 || op.index >= 16) throw INTERNAL_ERROR;

			if(vDcl[op.index] == true) return;
			else throw Error("Use of undeclared input register v%d", op.index);
		}
	}

	// Passes the storage address of a shader register operand to
	// CodeGenerator::free().
	void VS_2_0Assembler::free(const Operand &tmp)
	{
		void *const storage = reference(tmp);

		CodeGenerator::free(storage);
	}

	void VS_2_0Assembler::freeTemps()
	{
		free(tmp0);
		free(tmp1);
		free(tmp2);
		free(tmp3);
		free(tmp4);
		free(tmp5);
		free(tmp6);
		free(tmp7);
	}

	// Handler for the version instruction; it generates no code.
	void VS_2_0Assembler::VS_2_0()
	{
	}

	// Declares input register `dest` as the position0 stream (read as FLOAT3)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_POSITION0(Operand &dest)
	{
		annotate("DCL_POSITION0(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!position[0])
		{
			throw Error("No stream source for position0");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_POSITION0;
		input[slot].type = Stream::TYPE_FLOAT3;
		input[slot].buffer = (void**)&position[0];

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the position1 stream (read as FLOAT3)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_POSITION1(Operand &dest)
	{
		// The annotation used to say DCL_POSITION0 (copy/paste slip)
		annotate("DCL_POSITION1(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!position[1]) throw Error("No stream source for position1");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&position[1];
		input[dest.index].type = Stream::TYPE_FLOAT3;
		input[dest.index].usage = Stream::USAGE_POSITION1;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the blend-weight stream (read as
	// FLOAT4) and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_BLENDWEIGHT(Operand &dest)
	{
		annotate("DCL_BLENDWEIGHT(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!blendWeight)
		{
			throw Error("No stream source for blendWeight");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_BLENDWEIGHT;
		input[slot].type = Stream::TYPE_FLOAT4;
		input[slot].buffer = (void**)&blendWeight;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the blend-indices stream (read as
	// UBYTE4) and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_BLENDINDICES(Operand &dest)
	{
		annotate("DCL_BLENDINDICES(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!blendIndices)
		{
			throw Error("No stream source for blendIndices");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_BLENDINDICES;
		input[slot].type = Stream::TYPE_UBYTE4;
		input[slot].buffer = (void**)&blendIndices;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the normal0 stream (read as FLOAT3)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_NORMAL0(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_NORMAL0(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!normal[0]) throw Error("No stream source for normal0");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&normal[0];
		input[dest.index].type = Stream::TYPE_FLOAT3;
		input[dest.index].usage = Stream::USAGE_NORMAL0;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the normal1 stream (read as FLOAT3)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_NORMAL1(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_NORMAL1(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!normal[1]) throw Error("No stream source for normal1");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&normal[1];
		input[dest.index].type = Stream::TYPE_FLOAT3;
		input[dest.index].usage = Stream::USAGE_NORMAL1;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the point-size stream (one float per
	// vertex) and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_PSIZE(Operand &dest)
	{
		annotate("DCL_PSIZE(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!pSize) throw Error("No stream source for psize");

		// Use TYPE_FLOAT1 like the other scalar streams (tessFactor, depth):
		// it is the scalar type readInput() has a case for, whereas the
		// previous TYPE_FLOAT is not handled there.
		// NOTE(review): presumably TYPE_FLOAT was meant as the same type - confirm
		input[dest.index].buffer = (void**)&pSize;
		input[dest.index].type = Stream::TYPE_FLOAT1;
		input[dest.index].usage = Stream::USAGE_PSIZE;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord0 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD0(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD0(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[0]) throw Error("No stream source for texCoord0");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[0];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD0;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord1 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD1(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD1(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[1]) throw Error("No stream source for texCoord1");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[1];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD1;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord2 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD2(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD2(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[2]) throw Error("No stream source for texCoord2");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[2];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD2;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord3 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD3(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD3(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[3]) throw Error("No stream source for texCoord3");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[3];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD3;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord4 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD4(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD4(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[4]) throw Error("No stream source for texCoord4");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[4];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD4;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord5 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD5(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD5(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[5]) throw Error("No stream source for texCoord5");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[5];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD5;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord6 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD6(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD6(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[6]) throw Error("No stream source for texCoord6");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[6];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD6;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the texCoord7 stream (read as FLOAT2)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TEXCOORD7(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_TEXCOORD7(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!texCoord[7]) throw Error("No stream source for texCoord7");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&texCoord[7];
		input[dest.index].type = Stream::TYPE_FLOAT2;
		input[dest.index].usage = Stream::USAGE_TEXCOORD7;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the tangent stream (read as FLOAT3)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TANGENT(Operand &dest)
	{
		annotate("DCL_TANGENT(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!tangent)
		{
			throw Error("No stream source for tangent");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_TANGENT;
		input[slot].type = Stream::TYPE_FLOAT3;
		input[slot].buffer = (void**)&tangent;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the binormal stream (read as FLOAT3)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_BINORMAL(Operand &dest)
	{
		annotate("DCL_BINORMAL(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!binormal)
		{
			throw Error("No stream source for binormal");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_BINORMAL;
		input[slot].type = Stream::TYPE_FLOAT3;
		input[slot].buffer = (void**)&binormal;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the tessellation-factor stream (read
	// as FLOAT1) and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_TESSFACTOR(Operand &dest)
	{
		annotate("DCL_TESSFACTOR(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!tessFactor)
		{
			throw Error("No stream source for tessFactor");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_TESSFACTOR;
		input[slot].type = Stream::TYPE_FLOAT1;
		input[slot].buffer = (void**)&tessFactor;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the positiont stream (read as FLOAT4)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_POSITIONT(Operand &dest)
	{
		annotate("DCL_POSITIONT(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!positiont)
		{
			throw Error("No stream source for positiont");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_POSITIONT;
		input[slot].type = Stream::TYPE_FLOAT4;
		input[slot].buffer = (void**)&positiont;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the color0 stream (read as COLOR)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_COLOR0(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_COLOR0(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!color[0]) throw Error("No stream source for color0");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&color[0];
		input[dest.index].type = Stream::TYPE_COLOR;
		input[dest.index].usage = Stream::USAGE_COLOR0;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the color1 stream (read as COLOR)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_COLOR1(Operand &dest)
	{
		// The format string lacked "(%s)", so the register was never printed
		annotate("DCL_COLOR1(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!color[1]) throw Error("No stream source for color1");

		// The address of the stream pointer is stored, not the pointer itself
		input[dest.index].buffer = (void**)&color[1];
		input[dest.index].type = Stream::TYPE_COLOR;
		input[dest.index].usage = Stream::USAGE_COLOR1;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the fog stream (one float per vertex)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_FOG(Operand &dest)
	{
		annotate("DCL_FOG(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER) throw Error("Only input registers can be DCL'ed");
		if(dest.index < 0 || dest.index >= 16) throw INTERNAL_ERROR;

		if(!fog) throw Error("No stream source for fog");

		// The fog stream is a 'const float *', so it has to be read as a
		// scalar like the depth stream. The previous TYPE_FLOAT3 made
		// readInput() fetch 16 bytes per vertex with a 16-byte stride.
		input[dest.index].buffer = (void**)&fog;
		input[dest.index].type = Stream::TYPE_FLOAT1;
		input[dest.index].usage = Stream::USAGE_FOG;

		vDcl[dest.index] = true;
		readInput(dest.index);
	}

	// Declares input register `dest` as the depth stream (read as FLOAT1)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_DEPTH(Operand &dest)
	{
		annotate("DCL_DEPTH(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!depth)
		{
			throw Error("No stream source for depth");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_DEPTH;
		input[slot].type = Stream::TYPE_FLOAT1;
		input[slot].buffer = (void**)&depth;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Declares input register `dest` as the sample stream (read as FLOAT4)
	// and emits the code that loads it.
	// Throws Error when dest is not an input register or no stream is bound,
	// and INTERNAL_ERROR when the register index is outside 0..15.
	void VS_2_0Assembler::DCL_SAMPLE(Operand &dest)
	{
		annotate("DCL_SAMPLE(%s)", dest.string());

		if(dest.type != Operand::INPUT_REGISTER)
		{
			throw Error("Only input registers can be DCL'ed");
		}

		const int slot = dest.index;

		if(slot < 0 || slot >= 16)
		{
			throw INTERNAL_ERROR;
		}

		if(!sample)
		{
			throw Error("No stream source for sample");
		}

		// The address of the stream pointer is stored, not the pointer itself
		input[slot].usage = Stream::USAGE_SAMPLE;
		input[slot].type = Stream::TYPE_FLOAT4;
		input[slot].buffer = (void**)&sample;

		vDcl[slot] = true;
		readInput(slot);
	}

	// Writes the four float literals straight into the storage of `dest`.
	// This happens when the handler runs, not in the generated code.
	void VS_2_0Assembler::DEF(Operand &dest, FValue1 fValue1, FValue2 fValue2, FValue3 fValue3, FValue4 fValue4)
	{
		annotate("DEF(%s, %s, %s, %s, %s)", dest.string(), fValue1.string(), fValue2.string(), fValue3.string(), fValue4.string());

		float *const constant = (float*)reference(dest);

		constant[0] = fValue1.value;
		constant[1] = fValue2.value;
		constant[2] = fValue3.value;
		constant[3] = fValue4.value;
	}

	// Writes the four integer literals straight into the storage of `dest`.
	// This happens when the handler runs, not in the generated code.
	void VS_2_0Assembler::DEFI(Operand &dest, IntegerValue1 integerValue1, IntegerValue2 integerValue2, IntegerValue3 integerValue3, IntegerValue4 integerValue4)
	{
		annotate("DEFI(%s, %s, %s, %s, %s)", dest.string(), integerValue1.string(), integerValue2.string(), integerValue3.string(), integerValue4.string());

		int *const constant = (int*)reference(dest);

		constant[0] = integerValue1.integer;
		constant[1] = integerValue2.integer;
		constant[2] = integerValue3.integer;
		constant[3] = integerValue4.integer;
	}

	// Writes the boolean literal straight into the storage of `dest`.
	// This happens when the handler runs, not in the generated code.
	void VS_2_0Assembler::DEFB(Operand &dest, BooleanValue booleanValue)
	{
		annotate("DEFB(%s, %s)", dest.string(), booleanValue.string());

		bool *const constant = (bool*)reference(dest);

		*constant = booleanValue.boolean;
	}

	// ABS dst, src: per-component absolute value.
	// The modified source is ANDed with 0x7FFFFFFF in every lane, which clears
	// the IEEE sign bit, and the result is stored through SAT_MASK.
	void VS_2_0Assembler::ABS(Operand &dst, const Operand &src)
	{
		annotate("ABS(%s, %s)", dst.string(), src.string());

		// Despite its name this constant keeps every bit except the sign bit
		static const int4 SIGN_MASK = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};

		NEG_SWIZZLE(tmp0, src);

		andps(r128(tmp0), xmmword_ptr [SIGN_MASK]);

		SAT_MASK(dst, tmp0);
	}

	// ADD dst, src0, src1: per-component sum of the two modified sources,
	// stored through SAT_MASK.
	void VS_2_0Assembler::ADD(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("ADD(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		addps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// CRS dst, src0, src1: three-component cross product.
	//   dest.x = src0.y * src1.z - src0.z * src1.y
	//   dest.y = src0.z * src1.x - src0.x * src1.z
	//   dest.z = src0.x * src1.y - src0.y * src1.x
	// Each product pair is formed by rotating the xyz lanes of the sources
	// with shufps (0xC9 selects y,z,x,w and 0xD2 selects z,x,y,w).
	void VS_2_0Assembler::CRS(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("CRS(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		// Subtrahend: src0.zxy * src1.yzx
		movaps(x128(tmp3), m128(tmp0));
		movaps(x128(tmp2), m128(tmp1));
		shufps(r128(tmp3), m128(tmp0), 0xD2);
		shufps(r128(tmp2), m128(tmp1), 0xC9);
		mulps(r128(tmp3), m128(tmp2));

		// Minuend: src0.yzx * src1.zxy
		movaps(x128(tmp2), m128(tmp1));
		shufps(r128(tmp2), m128(tmp1), 0xD2);
		movaps(x128(tmp1), m128(tmp0));
		// 0xC9, not 0xD9: the third lane must receive src0.x, otherwise
		// dest.z is computed from src0.y twice
		shufps(r128(tmp1), m128(tmp0), 0xC9);
		mulps(r128(tmp1), m128(tmp2));

		subps(r128(tmp1), m128(tmp3));

		SAT_MASK(dst, tmp1);
	}

	// DP3 dst, src0, src1: three-component dot product of the modified sources.
	// The scalar sum is replicated into all four lanes and then written through
	// dst's own write mask, so "dp3 r0.x, ..." leaves r0.yzw untouched.
	void VS_2_0Assembler::DP3(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("DP3(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulps(r128(tmp0), m128(tmp1));			// per-lane products
		movhlps(r128(tmp1), r128(tmp0));		// tmp1.x = z product
		addss(r128(tmp1), xmm32(tmp0));			// tmp1.x = x + z
		shufps(r128(tmp0), m128(tmp0), 0x01);	// tmp0.x = y product
		addss(r128(tmp0), xmm32(tmp1));			// tmp0.x = x + y + z
		shufps(r128(tmp0), r128(tmp0), 0x00);	// replicate to all lanes

		// Every lane holds the sum, so the ordinary masked store delivers it
		// to exactly the components the destination selects
		SAT_MASK(dst, tmp0);
	}

	// DP4 dst, src0, src1: four-component dot product of the modified sources.
	// The scalar sum is replicated into all four lanes and then written through
	// dst's own write mask, so "dp4 oPos.x, ..." only replaces oPos.x.
	void VS_2_0Assembler::DP4(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("DP4(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulps(r128(tmp0), m128(tmp1));			// per-lane products
		movhlps(r128(tmp1), r128(tmp0));		// tmp1.xy = z, w products
		addps(r128(tmp0), m128(tmp1));			// tmp0.xy = x + z, y + w
		movss(x128(tmp1), xmm32(tmp0));			// tmp1.x = x + z
		shufps(r128(tmp0), m128(tmp0), 0x01);	// tmp0.x = y + w
		addss(r128(tmp1), xmm32(tmp0));			// tmp1.x = x + y + z + w
		shufps(r128(tmp1), r128(tmp1), 0x00);	// replicate to all lanes

		// Every lane holds the sum, so the ordinary masked store delivers it
		// to exactly the components the destination selects
		SAT_MASK(dst, tmp1);
	}

	// EXP dst, src: approximate 2^x of the first lane of the modified source.
	// The input is clamped to [M, N], split into an integer part n (cvtss2si)
	// and a remainder f = x - n, then evaluated as
	//   ((A - f) / (A + f)) * 2^n
	// where the quotient uses rcpss and 2^n is built by placing n + 127 in the
	// float exponent field. The result is replicated to all lanes and stored
	// through SAT_MASK.
	void VS_2_0Assembler::EXP(Operand &dst, const Operand &src)
	{
		annotate("EXP(%s, %s)", dst.string(), src.string());

		static const float A = -2.91421356e+0f;

		// Safe limits
		static const float M = -100;
		static const float N = 100;

		// TEMP carries the integer bit pattern back to an SSE register;
		// the address of r identifies the general purpose register used for n
		static float TEMP;
		static int r;

		NEG_SWIZZLE(tmp0, src);

		minss(r128(tmp0), dword_ptr [&N]);
		maxss(r128(tmp0), dword_ptr [&M]);
		movss(x128(tmp1), xmm32(tmp0));
		cvtss2si(x32(&r), xmm32(tmp1));			// n = integer part of x
		cvtsi2ss(r128(tmp1), r32(&r));
		subss(r128(tmp0), xmm32(tmp1));			// f = x - n
		movss(x128(tmp1), dword_ptr [&A]);
		subss(r128(tmp1), xmm32(tmp0));			// A - f
		addss(r128(tmp0), dword_ptr [&A]);		// A + f
		rcpss(r128(tmp0), xmm32(tmp0));
		mulss(r128(tmp0), xmm32(tmp1));			// (A - f) / (A + f)
		add(r32(&r), 127);						// biased exponent
		shl(r32(&r), 23);						// move into the exponent field
		mov(dword_ptr [&TEMP], r32(&r));		free(&r);
		mulss(r128(tmp0), dword_ptr [&TEMP]);	// scale by 2^n
		shufps(r128(tmp0), r128(tmp0), 0x00);

		SAT_MASK(dst, tmp0);
	}

	// EXPP dst, src: approximate 2^x of the first lane of the modified source.
	// Emits the same sequence as EXP but without clamping the input first:
	// x is split into an integer part n and a remainder f, and the result is
	//   ((A - f) / (A + f)) * 2^n
	// replicated to all lanes and stored through SAT_MASK.
	void VS_2_0Assembler::EXPP(Operand &dst, const Operand &src)
	{
		annotate("EXPP(%s, %s)", dst.string(), src.string());

		static const float A = -2.91421356e+0f;

		// TEMP carries the integer bit pattern back to an SSE register;
		// the address of r identifies the general purpose register used for n
		static float TEMP;
		static int r;

		NEG_SWIZZLE(tmp0, src);

		movss(x128(tmp1), xmm32(tmp0));
		cvtss2si(x32(&r), xmm32(tmp1));			// n = integer part of x
		cvtsi2ss(r128(tmp1), r32(&r));
		subss(r128(tmp0), xmm32(tmp1));			// f = x - n
		movss(x128(tmp1), dword_ptr [&A]);
		subss(r128(tmp1), xmm32(tmp0));			// A - f
		addss(r128(tmp0), dword_ptr [&A]);		// A + f
		rcpss(r128(tmp0), xmm32(tmp0));
		mulss(r128(tmp0), xmm32(tmp1));			// (A - f) / (A + f)
		add(r32(&r), 127);						// biased exponent
		shl(r32(&r), 23);						// move into the exponent field
		mov(dword_ptr [&TEMP], r32(&r));		free(&r);
		mulss(r128(tmp0), dword_ptr [&TEMP]);	// scale by 2^n
		shufps(r128(tmp0), r128(tmp0), 0x00);

		SAT_MASK(dst, tmp0);
	}

	// FRC dst, src: per-component fractional part.
	// The sign bit of the modified source is cleared first, so the value
	// produced is |x| minus the integer obtained by converting |x| - 0.5 with
	// cvtps2pi. The conversion goes through two 64-bit MMX registers (c0 for
	// the low pair, c1 for the high pair).
	// NOTE(review): because of the initial andps, negative inputs yield the
	// fraction of |x| rather than x - floor(x) - confirm this is intended.
	// NOTE(review): the -0.5 bias presumably relies on round-to-nearest
	// conversion to obtain floor(|x|) - confirm the active rounding mode.
	void VS_2_0Assembler::FRC(Operand &dst, const Operand &src)
	{
		annotate("FRC(%s, %s)", dst.string(), src.string());

		static const float4 HALF = {0.5f, 0.5f, 0.5f, 0.5f};
		static const int4 MASK = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};

		NEG_SWIZZLE(tmp0, src);

		// Their addresses identify the two MMX registers used below
		static qword c0;
		static qword c1;

		andps(r128(tmp0), xmmword_ptr [MASK]);	// |x|
		subps(r128(tmp0), xmmword_ptr [HALF]);	// |x| - 0.5
		cvtps2pi(x64(&c0), r128(tmp0));			// integers for lanes x, y
		movhlps(r128(tmp1), r128(tmp0));
		cvtps2pi(x64(&c1), r128(tmp1));			// integers for lanes z, w
		cvtpi2ps(r128(tmp1), r64(&c1));			free(&c1);
		movlhps(r128(tmp1), r128(tmp1));		// move z, w results to the high half
		cvtpi2ps(r128(tmp1), r64(&c0));			free(&c0);
		addps(r128(tmp0), xmmword_ptr [HALF]);	// back to |x|
		subps(r128(tmp0), m128(tmp1));			// |x| - integer part

		SAT_MASK(dst, tmp0);
	}

	// LIT dst, src: lighting coefficients.
	//   dst = (1, 0,     0,                     1)  when src.x <= 0
	//   dst = (1, src.x, 0,                     1)  when src.x > 0, src.y <= 0
	//   dst = (1, src.x, POW(src.y, src.w),     1)  when src.x > 0, src.y > 0
	// src is read as stored (source modifiers are not applied) and dst is
	// written as a whole register. Both early exits target the "nolit" label.
	void VS_2_0Assembler::LIT(Operand &dst, const Operand &src)
	{
		annotate("LIT(%s, %s)", dst.string(), src.string());

		static const float4 init = {1, 0, 0, 1};
		static const float zero = 0;

		// POW works through LOG and EXP, which overwrite tmp0 and tmp1, so the
		// exponent and the base live in temporaries that POW leaves alone
		Operand &power = tmp2;
		Operand &base = tmp3;

		movaps(x128(dst), xword_ptr [&init]);

		movaps(x128(power), r128(src));
		shufps(r128(power), r128(power), 0xFF);		// src.wwww

		comiss(x128(src), dword_ptr [&zero]);

		// Registers are written back before every branch so that all paths
		// reach the label with the same allocation state
		spillAll();
		jng("nolit");

		movlhps(r128(dst), r128(src));				// (1, 0, src.x, src.y)
		shufps(r128(dst), r128(dst), 0x18);			// (1, src.x, 0, 1)

		movaps(x128(base), r128(src));
		shufps(r128(base), r128(base), 0x55);		// src.yyyy
		comiss(x128(base), dword_ptr [&zero]);

		spillAll();
		jng("nolit");

		POW(base, base, power);

		shufps(r128(base), r128(dst), 0x00);		// (pow, pow, 1, 1)
		shufps(r128(dst), r128(base), 0x84);		// (1, src.x, pow, 1)

		spillAll();
	label("nolit");

		freeTemps();
	}

	// LOG dst, src: approximate base-2 logarithm of the first lane of the
	// modified source. The float is taken apart with integer instructions:
	//   m = mantissa forced into [1, 2)   (bits & 0x007FFFFF | 0x3F800000)
	//   e = biased exponent - 127         ((bits & 0x7F800000) >> 23) - 127
	// and the result is  A * (m - 1) / (m + B) + e,  with the quotient taken
	// through rcpss. It is replicated to all lanes and stored via SAT_MASK.
	// The sign bit of the input is ignored by both masks.
	void VS_2_0Assembler::LOG(Operand &dst, const Operand &src)
	{
		annotate("LOG(%s, %s)", dst.string(), src.string());

		static const float A =  3.42234550e0f;
		static const float B =  1.42234550e0f;

		// Memory slots used to move bit patterns between SSE and integer
		// registers; the address of r identifies the integer register
		static float TEMP1;
		static float TEMP2;
		static int r;

		NEG_SWIZZLE(tmp0, src);

		movss(dword_ptr [&TEMP1], r128(tmp0));
		mov(x32(&r), dword_ptr [&TEMP1]);
		and(r32(&r), 0x007FFFFF);				// keep the mantissa bits
		or(r32(&r), 0x3F800000);				// exponent of 1.0 -> m in [1, 2)
		mov(dword_ptr [&TEMP2], r32(&r));
		movss(x128(tmp0), dword_ptr [&TEMP2]);
		movss(x128(tmp1), xmm32(tmp0));
		mulss(r128(tmp0), dword_ptr [&A]);
		subss(r128(tmp0), dword_ptr [&A]);		// A * (m - 1)
		addss(r128(tmp1), dword_ptr [&B]);		// m + B
		rcpss(r128(tmp1), xmm32(tmp1));
		mulss(r128(tmp0), xmm32(tmp1));
		mov(x32(&r), dword_ptr [&TEMP1]);
		and(r32(&r), 0x7F800000);				// keep the exponent bits
		shr(r32(&r), 23);
		sub(r32(&r), 127);						// remove the bias
		cvtsi2ss(r128(tmp1), r32(&r));			free(&r);
		addss(r128(tmp0), xmm32(tmp1));
		shufps(r128(tmp0), r128(tmp0), 0x00);

		SAT_MASK(dst, tmp0);
	}

	// LOGP dst, src: approximate base-2 logarithm of the first lane of the
	// modified source. Emits the same instruction sequence as LOG:
	//   m = mantissa forced into [1, 2),  e = biased exponent - 127
	//   result = A * (m - 1) / (m + B) + e
	// replicated to all lanes and stored via SAT_MASK.
	void VS_2_0Assembler::LOGP(Operand &dst, const Operand &src)
	{
		annotate("LOGP(%s, %s)", dst.string(), src.string());

		static const float A =  3.42234550e0f;
		static const float B =  1.42234550e0f;

		// Memory slots used to move bit patterns between SSE and integer
		// registers; the address of r identifies the integer register
		static float TEMP1;
		static float TEMP2;
		static int r;

		NEG_SWIZZLE(tmp0, src);

		movss(dword_ptr [&TEMP1], r128(tmp0));
		mov(x32(&r), dword_ptr [&TEMP1]);
		and(r32(&r), 0x007FFFFF);				// keep the mantissa bits
		or(r32(&r), 0x3F800000);				// exponent of 1.0 -> m in [1, 2)
		mov(dword_ptr [&TEMP2], r32(&r));
		movss(x128(tmp0), dword_ptr [&TEMP2]);
		movss(x128(tmp1), xmm32(tmp0));
		mulss(r128(tmp0), dword_ptr [&A]);
		subss(r128(tmp0), dword_ptr [&A]);		// A * (m - 1)
		addss(r128(tmp1), dword_ptr [&B]);		// m + B
		rcpss(r128(tmp1), xmm32(tmp1));
		mulss(r128(tmp0), xmm32(tmp1));
		mov(x32(&r), dword_ptr [&TEMP1]);
		and(r32(&r), 0x7F800000);				// keep the exponent bits
		shr(r32(&r), 23);
		sub(r32(&r), 127);						// remove the bias
		cvtsi2ss(r128(tmp1), r32(&r));			free(&r);
		addss(r128(tmp0), xmm32(tmp1));
		shufps(r128(tmp0), r128(tmp0), 0x00);

		SAT_MASK(dst, tmp0);
	}

	// LRP dst, src0, src1, src2: per-component linear interpolation,
	//   dest = src2 + src0 * (src1 - src2)
	// computed on the modified sources and stored through SAT_MASK.
	void VS_2_0Assembler::LRP(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		// The listing must show this instruction's own mnemonic
		annotate("LRP(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);
		NEG_SWIZZLE(tmp2, src2);

		subps(r128(tmp1), m128(tmp2));		// src1 - src2
		mulps(r128(tmp1), m128(tmp0));		// src0 * (src1 - src2)
		addps(r128(tmp1), m128(tmp2));		// + src2

		SAT_MASK(dst, tmp1);
	}

	// M3X2 dst, src0, src1: product of the modified src0.xyz with a matrix of
	// two rows read through m128(src1, 0) and m128(src1, 1).
	// The rows are interleaved into column vectors (tmp5 = x column,
	// tmp2 = y column, tmp4 = z column), each column is scaled by the matching
	// replicated component of src0, and the three products are summed. Lanes
	// x and y of the sum are the dot products with row 0 and row 1; only those
	// two lanes are stored (SAT_MOV_XY).
	// NOTE(review): m128(src1, k) presumably addresses the k-th register
	// following src1 - confirm against the m128 helper.
	void VS_2_0Assembler::M3X2(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M3X2(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// z column: (row0.z, row1.z, ...)
		unpckhps(r128(tmp0), m128(tmp0));
		unpckhps(r128(tmp1), m128(tmp1));
		movaps(x128(tmp4), m128(tmp0));
		unpcklps(r128(tmp4), m128(tmp1));

		// y column in tmp2, x column in tmp5
		unpcklps(r128(tmp2), m128(tmp2));
		unpcklps(r128(tmp3), m128(tmp3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Replicate src0.x, src0.y and src0.z across their own registers
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XY(dst, tmp1);
	}

	// M3X3 dst, src0, src1: product of the modified src0.xyz with a matrix of
	// three rows read through m128(src1, 0..2).
	// The rows are interleaved into column vectors (tmp5 = x column,
	// tmp2 = y column, tmp4 = z column), each column is scaled by the matching
	// replicated component of src0, and the three products are summed. Lanes
	// x, y, z of the sum are the dot products with rows 0, 1, 2; only those
	// lanes are stored (SAT_MOV_XYZ).
	// NOTE(review): m128(src1, k) presumably addresses the k-th register
	// following src1 - confirm against the m128 helper.
	void VS_2_0Assembler::M3X3(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M3X3(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// z column: (row0.z, row1.z, row2.z, ...)
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(tmp1));
		movaps(x128(tmp4), m128(tmp0));
		unpcklps(r128(tmp4), m128(tmp1));

		// y column in tmp2, x column in tmp5
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(tmp3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Replicate src0.x, src0.y and src0.z across their own registers
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZ(dst, tmp1);
	}

	// M3X4 dst, src0, src1: product of the modified src0.xyz with a matrix of
	// four rows read through m128(src1, 0..3).
	// The rows are interleaved into column vectors (tmp5 = x column,
	// tmp2 = y column, tmp4 = z column), each column is scaled by the matching
	// replicated component of src0, and the three products are summed. Lane k
	// of the sum is the three-component dot product with row k; all four lanes
	// are stored (SAT_MOV_XYZW).
	// NOTE(review): m128(src1, k) presumably addresses the k-th register
	// following src1 - confirm against the m128 helper.
	void VS_2_0Assembler::M3X4(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M3X4(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// z column: (row0.z, row1.z, row2.z, row3.z)
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(src1, 3));
		movaps(x128(tmp4), m128(tmp0));
		unpcklps(r128(tmp4), m128(tmp1));

		// y column in tmp2, x column in tmp5
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(src1, 3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Replicate src0.x, src0.y and src0.z across their own registers
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZW(dst, tmp1);
	}

	// M4X3 dst, src0, src1: product of the modified four-component src0 with a
	// matrix of three rows read through m128(src1, 0..2).
	// The rows are interleaved into column vectors (tmp5 = x column,
	// tmp2 = y column, tmp4 = z column, tmp0 = w column), each column is scaled
	// by the matching replicated component of src0, and the four products are
	// summed. Lanes x, y, z of the sum are the dot products with rows 0, 1, 2;
	// only those lanes are stored (SAT_MOV_XYZ).
	// NOTE(review): m128(src1, k) presumably addresses the k-th register
	// following src1 - confirm against the m128 helper.
	void VS_2_0Assembler::M4X3(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M4X3(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// z column in tmp4, w column left in tmp0
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(tmp1));
		movaps(x128(tmp4), m128(tmp0)); 
		unpckhps(r128(tmp0), m128(tmp1));
		unpcklps(r128(tmp4), m128(tmp1));

		// y column in tmp2, x column in tmp5
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(tmp3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Replicate src0.x, .y, .z and .w across their own registers
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		movaps(x128(tmp3), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);
		shufps(r128(tmp3), m128(tmp3), 0xFF);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));
		mulps(r128(tmp3), m128(tmp0));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp7), m128(tmp3));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZ(dst, tmp1);
	}

	// M4X4 dst, src0, src1: product of the modified four-component src0 with a
	// matrix of four rows read through m128(src1, 0..3).
	// The rows are transposed into column vectors (tmp5 = x column,
	// tmp2 = y column, tmp4 = z column, tmp0 = w column), each column is scaled
	// by the matching replicated component of src0, and the four products are
	// summed. Lane k of the sum is the dot product of src0 with row k; all four
	// lanes are stored (SAT_MOV_XYZW).
	// NOTE(review): m128(src1, k) presumably addresses the k-th register
	// following src1 - confirm against the m128 helper.
	void VS_2_0Assembler::M4X4(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M4X4(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// z column in tmp4, w column left in tmp0
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(src1, 3));
		movaps(x128(tmp4), m128(tmp0)); 
		unpckhps(r128(tmp0), m128(tmp1));
		unpcklps(r128(tmp4), m128(tmp1));

		// y column in tmp2, x column in tmp5
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(src1, 3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Replicate src0.x, .y, .z and .w across their own registers
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		movaps(x128(tmp3), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);
		shufps(r128(tmp3), m128(tmp3), 0xFF);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));
		mulps(r128(tmp3), m128(tmp0));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp7), m128(tmp3));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZW(dst, tmp1);
	}

	// MAD dst, src0, src1, src2: per-component multiply-add,
	//   dest = src0 * src1 + src2
	// computed on the modified sources and stored through SAT_MASK.
	void VS_2_0Assembler::MAD(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("MAD(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);
		NEG_SWIZZLE(tmp2, src2);

		mulps(r128(tmp0), m128(tmp1));
		addps(r128(tmp0), m128(tmp2));

		SAT_MASK(dst, tmp0);
	}

	// MAX dst, src0, src1: per-component maximum (maxps) of the modified
	// sources, stored through SAT_MASK.
	void VS_2_0Assembler::MAX(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("MAX(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		maxps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// MIN dst, src0, src1: per-component minimum (minps) of the modified
	// sources, stored through SAT_MASK.
	void VS_2_0Assembler::MIN(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("MIN(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		minps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// MOV dst, src: copies the modified source into dst through SAT_MASK.
	// When dst is one of the output registers, the matching flag is first added
	// to the output vertex format (oFVF). Throws INTERNAL_ERROR for a colour or
	// position register index other than 0 or 1.
	void VS_2_0Assembler::MOV(Operand &dst, const Operand &src)
	{
		annotate("MOV(%s, %s)", dst.string(), src.string());

		if(dst.type == Operand::DIFFUSE_SPECULAR_REGISTER)
		{
			// oD0 is the diffuse colour, oD1 the specular colour
			if(dst.index == 0)
			{
				oFVF |= FVF_DIFFUSE;
			}
			else if(dst.index == 1)
			{
				oFVF |= FVF_SPECULAR;
			}
			else
			{
				throw INTERNAL_ERROR;
			}
		}
		else if(dst.type == Operand::POSITION_REGISTER)
		{
			// Both accepted indices set the same position flag
			if(dst.index != 0 && dst.index != 1)
			{
				throw INTERNAL_ERROR;
			}

			oFVF |= FVF_POSITION;
		}
		else if(dst.type == Operand::POINT_SIZE_REGISTER)
		{
			oFVF |= FVF_POINT_SIZE;
		}
		else if(dst.type == Operand::TEXTURE_COORDINATE_REGISTER)
		{
			// One flag bit per texture coordinate set
			oFVF |= (1 << (FVF_TEX_SHIFT + dst.index));
		}
		else if(dst.type == Operand::FOG_REGISTER)
		{
			oFVF |= FVF_FOG;
		}

		NEG_SWIZZLE(tmp0, src);
		SAT_MASK(dst, tmp0);
	}

	// MOVA dst, src: converts the first lane of src to an integer (cvtss2si)
	// into the general purpose register associated with the static aL.
	// dst is only used for the annotation text.
	// NOTE(review): source modifiers are not applied, and the value goes to aL
	// although the class also declares a0 - confirm which register is intended.
	void VS_2_0Assembler::MOVA(Operand &dst, const Operand &src)
	{
		annotate("MOVA(%s, %s)", dst.string(), src.string());

		cvtss2si(r32(&aL), r128(src));
	}

	// MUL dst, src0, src1: per-component product of the modified sources,
	// stored through SAT_MASK.
	void VS_2_0Assembler::MUL(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("MUL(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulps(r128(tmp0), r128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// NOP: emits a single nop instruction.
	void VS_2_0Assembler::NOP()
	{
		annotate("NOP()");

		nop();
	}

	// NRM dst, src: scales the modified source by the reciprocal square root
	// (rsqrtss estimate) of the sum of squares of its x, y and z lanes, and
	// stores the result through SAT_MASK. All four lanes are scaled.
	void VS_2_0Assembler::NRM(Operand &dst, const Operand &src)
	{
		annotate("NRM(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);

		// Keep the negated/swizzled source: tmp0 is consumed by the length
		// computation, and the final scale must use the modified value rather
		// than the raw register
		movaps(x128(tmp2), m128(tmp0));

		mulps(r128(tmp0), m128(tmp0));			// per-lane squares
		movhlps(r128(tmp1), r128(tmp0));		// tmp1.x = z^2
		addss(r128(tmp1), xmm32(tmp0));			// tmp1.x = x^2 + z^2
		shufps(r128(tmp0), m128(tmp0), 0x01);	// tmp0.x = y^2
		addss(r128(tmp0), xmm32(tmp1));			// tmp0.x = x^2 + y^2 + z^2

		rsqrtss(r128(tmp0), xmm32(tmp0));
		shufps(r128(tmp0), m128(tmp0), 0x00);	// replicate 1 / length
		mulps(r128(tmp0), m128(tmp2));

		SAT_MASK(dst, tmp0);
	}

	// POW dst, src0, src1: computed as EXP(LOG(src0) * src1) on the first lane.
	// LOG leaves its result in tmp0 and also uses tmp1 as scratch, and it runs
	// before src1 is read; a src1 that is itself tmp0 or tmp1 is therefore
	// overwritten before it is used.
	void VS_2_0Assembler::POW(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("POW(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		LOG(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulss(r128(tmp0), xmm32(tmp1));

		EXP(dst, tmp0);
	}

	// RCP dst, src: reciprocal of the first lane of the modified source,
	// replicated to all lanes and stored through SAT_MASK.
	// With the _PP modifier the raw rcpss estimate r is used; otherwise it is
	// refined once as 2r - x * r * r.
	void VS_2_0Assembler::RCP(Operand &dst, const Operand &src)
	{
		annotate("RCP(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);

		const bool partialPrecision = (instruction->modifier == Instruction::_PP);

		if(!partialPrecision)
		{
			movss(x128(tmp2), xmm32(tmp0));		// x
			rcpss(r128(tmp1), xmm32(tmp0));		// r
			mulss(r128(tmp2), xmm32(tmp1));		// x * r
			mulss(r128(tmp2), xmm32(tmp1));		// x * r * r
			addss(r128(tmp1), xmm32(tmp1));		// 2r
			subss(r128(tmp1), xmm32(tmp2));		// 2r - x * r * r
		}
		else
		{
			rcpss(r128(tmp1), xmm32(tmp0));
		}

		shufps(r128(tmp1), r128(tmp1), 0x00);

		SAT_MASK(dst, tmp1);
	}

	// RSQ dst, src: reciprocal square root of the first lane of the modified
	// source, replicated to all lanes and stored through SAT_MASK.
	// With the _PP modifier the raw rsqrtss estimate r is used; otherwise it is
	// refined once as 0.5 * r * (3 - x * r * r).
	void VS_2_0Assembler::RSQ(Operand &dst, const Operand &src)
	{
		annotate("RSQ(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);

		const bool partialPrecision = (instruction->modifier == Instruction::_PP);

		if(!partialPrecision)
		{
			static const float THREE = 3.0f;
			static const float HALF = 0.5f;

			rsqrtss(r128(tmp2), xmm32(tmp0));			// r
			movss(x128(tmp1), xmm32(tmp2));
			mulss(r128(tmp2), xmm32(tmp2));				// r * r
			mulss(r128(tmp2), xmm32(tmp0));				// x * r * r
			movss(x128(tmp0), dword_ptr [&THREE]);
			subss(r128(tmp0), xmm32(tmp2));				// 3 - x * r * r
			mulss(r128(tmp1), xmm32(tmp0));
			mulss(r128(tmp1), dword_ptr [&HALF]);
		}
		else
		{
			rsqrtss(r128(tmp1), xmm32(tmp0));
		}

		shufps(r128(tmp1), r128(tmp1), 0x00);

		SAT_MASK(dst, tmp1);
	}

	// SGE dst, src0, src1: per-component "set on greater or equal".
	// cmpnltps produces an all-ones lane where src0 is not less than src1;
	// ANDing with 1.0f turns that into 1.0f or 0.0f. Stored through SAT_MASK.
	void VS_2_0Assembler::SGE(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("SGE(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// dest.xyzw = (src0.xyzw >= src1.xyzw) ? 1.0f : 0.0f;

		static const float4 one = {1, 1, 1, 1};

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		cmpnltps(r128(tmp0), r128(tmp1));
		andps(r128(tmp0), xword_ptr [&one]);

		SAT_MASK(dst, tmp0);
	}

	// SGN dst, src0, src1, src2: per-component sign of the modified src0:
	//   -1.0f where src0 < 0, +1.0f where src0 > 0, 0.0f otherwise.
	// src1 and src2 only appear in the annotation; they are not read.
	void VS_2_0Assembler::SGN(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("SGN(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		static const float4 zero = {0, 0, 0, 0};
		static const float4 pos = {1, 1, 1, 1};
		static const float4 neg = {-1, -1, -1, -1};

		NEG_SWIZZLE(tmp1, src0);
		NEG_SWIZZLE(tmp2, src0);

		// Negative lanes: (src0 < 0) selects -1.0f
		cmpltps(r128(tmp1), xword_ptr [&zero]);
		andps(r128(tmp1), xword_ptr [&neg]);
		movaps(x128(tmp0), r128(tmp1));

		// Positive lanes: (0 < src0) selects +1.0f. A strict comparison is
		// needed here; "not less than zero" would also mark zero as positive
		movaps(x128(tmp3), xword_ptr [&zero]);
		cmpltps(r128(tmp3), r128(tmp2));
		andps(r128(tmp3), xword_ptr [&pos]);
		orps(r128(tmp0), r128(tmp3));

		SAT_MASK(dst, tmp0);
	}

	// SINCOS dst, src0, src1, src2: writes a cosine estimate to dst.x and a
	// sine estimate to dst.y (SAT_MOV_XY), using the first lane of the modified
	// src0 as the angle x. src1 and src2 only appear in the annotation.
	//   c = 1 + C*x^2 + B*x^4 + A * x^4 * (C*x^2 + B*x^4)
	//   s = 1 / rsqrt(1 - c*c)        (rsqrtss followed by rcpss)
	// The sine estimate is a square root, so it carries no sign.
	// NOTE(review): the last polynomial term multiplies x^4 by the partial sum
	// rather than by x^2 - confirm against the intended series.
	void VS_2_0Assembler::SINCOS(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("SINCOS(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		static const float C = -4.96818924e-1f;
		static const float B =  3.95277743e-2f;
		static const float A = -9.84989568e-4f;

		static const float ONE = 1.0f;

		NEG_SWIZZLE(tmp0, src0);

		mulss(r128(tmp0), xmm32(tmp0));			// x^2
		movss(x128(tmp1), xmm32(tmp0));
		mulss(r128(tmp0), dword_ptr [&C]);		// C*x^2
		mulss(r128(tmp1), xmm32(tmp1));			// x^4
		movss(x128(tmp2), xmm32(tmp1));
		mulss(r128(tmp1), dword_ptr [&B]);		// B*x^4
		addss(r128(tmp0), xmm32(tmp1));			// C*x^2 + B*x^4
		mulss(r128(tmp2), xmm32(tmp0));			// x^4 * (C*x^2 + B*x^4)
		mulss(r128(tmp2), dword_ptr [&A]);
		addss(r128(tmp0), xmm32(tmp2));
		addss(r128(tmp0), dword_ptr [&ONE]);	// c
		movss(x128(tmp1), xmm32(tmp0));
		mulss(r128(tmp1), xmm32(tmp1));			// c*c
		movss(x128(tmp2), dword_ptr [&ONE]);
		subss(r128(tmp2), xmm32(tmp1));			// 1 - c*c
		rsqrtss(r128(tmp2), xmm32(tmp2));
		rcpss(r128(tmp2), xmm32(tmp2));			// s
		movlhps(r128(tmp0), r128(tmp2));		// (c, ?, s, ?)
		shufps(r128(tmp0), m128(tmp0), 0x08);	// (c, s, c, c)

		SAT_MOV_XY(dst, tmp0);
	}

	// SUB dst, src0, src1: per-component difference src0 - src1 of the modified
	// sources, stored through SAT_MASK.
	void VS_2_0Assembler::SUB(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("SUB(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		subps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// Flow-control instructions

	void VS_2_0Assembler::CALL(const Operand &label)
	{
		annotate("ADD(%s)", label.string());

		spillAll();

		switch(label.index)
		{
		case 0:		call("l0");		break;
		case 1:		call("l1");		break;
		case 2:		call("l2");		break;
		case 3:		call("l3");		break;
		case 4:		call("l4");		break;
		case 5:		call("l5");		break;
		case 6:		call("l6");		break;
		case 7:		call("l7");		break;
		case 8:		call("l8");		break;
		case 9:		call("l9");		break;
		case 10:	call("l10");	break;
		case 11:	call("l10");	break;
		case 12:	call("l11");	break;
		case 13:	call("l12");	break;
		case 14:	call("l13");	break;
		case 15:	call("l14");	break;
		case 16:	call("l15");	break;
		default:
			throw Error("Label index (l%d) out of range", label.index);
		}
	}

	void VS_2_0Assembler::CALLNZ(const Operand &l, const Operand &boolRegister)
	{
		annotate("CALLNZ(%s)", l.string(), boolRegister.string());

		cmp(byte_ptr [&b[boolRegister.index]], (char)0);

		spillAll();
		jz("nocall");

		switch(l.index)
		{
		case 0: call("l0");		break;
		case 1: call("l1");		break;
		case 2: call("l2");		break;
		case 3: call("l3");		break;
		case 4: call("l4");		break;
		case 5: call("l5");		break;
		case 6: call("l6");		break;
		case 7: call("l7");		break;
		case 8: call("l8");		break;
		case 9: call("l9");		break;
		case 10: call("l10");	break;
		case 11: call("l10");	break;
		case 12: call("l11");	break;
		case 13: call("l12");	break;
		case 14: call("l13");	break;
		case 15: call("l14");	break;
		case 16: call("l15");	break;
		default:
			throw Error("Label index (l%d) out of range", l.index);
		}

		spillAll();
		label("nocall");
	}

	void VS_2_0Assembler::ELSE()
	{
		annotate("ELSE()");

		spillAll();
		label("else");
	}

	void VS_2_0Assembler::ENDIF()
	{
		annotate("ENDIF()");

		spillAll();
		label("else");
		spillAll();
		label("endif");
	}

	void VS_2_0Assembler::ENDREP()
	{
		annotate("ENDREP()");

		spillAll();
		jmp("rep");
		spillAll();
		label("endrep");
	}

	void VS_2_0Assembler::ENDLOOP()
	{
		annotate("ENDLOOP()");

		spillAll();
		jmp("loop");
		spillAll();
		label("endloop");
	}

	void VS_2_0Assembler::IF(const Operand &boolRegister)
	{
		annotate("IF(%s)", boolRegister.string());

		cmp(byte_ptr [&b[boolRegister.index]], (char)0);

		spillAll();
		jnz("else");
	}

	void VS_2_0Assembler::LABEL(const Operand &l)
	{
		annotate("LABEL(%s, %s, %s)", l.string());

		spillAll();

		switch(l.index)
		{
		case 0: label("l0");	break;
		case 1: label("l1");	break;
		case 2: label("l2");	break;
		case 3: label("l3");	break;
		case 4: label("l4");	break;
		case 5: label("l5");	break;
		case 6: label("l6");	break;
		case 7: label("l7");	break;
		case 8: label("l8");	break;
		case 9: label("l9");	break;
		case 10: label("l10");	break;
		case 11: label("l10");	break;
		case 12: label("l11");	break;
		case 13: label("l12");	break;
		case 14: label("l13");	break;
		case 15: label("l14");	break;
		case 16: label("l15");	break;
		default:
			throw Error("Label index (l%d) out of range", l.index);
		}
	}

	// LOOP aL, i#: opens a loop. At the "loop" label the static loop counter is
	// tested; when it is zero control leaves through "endloop", otherwise the
	// counter is decremented and the body runs.
	// NOTE(review): the counter is not initialised from integerRegister in
	// this function - confirm where it is expected to be loaded.
	void VS_2_0Assembler::LOOP(const Operand &aL, const Operand &integerRegister)
	{
		annotate("LOOP(%s, %s)", aL.string(), integerRegister.string());

		spillAll();
		label("loop");

		// The 'aL' parameter hides the static counter of the same name, so the
		// counter is named explicitly; taking the parameter's address would
		// make the generated code test the Operand object itself
		cmp(dword_ptr [&VS_2_0Assembler::aL], 0);

		spillAll();
		jz("endloop");
		dec(dword_ptr [&VS_2_0Assembler::aL]);
	}

	// REP i#: opens a repeat block. The first dword of integer register i# is
	// loaded as the iteration counter; at the "rep" label the counter is
	// tested, control leaves through "endrep" when it is zero, and otherwise
	// the counter is decremented and the body runs.
	void VS_2_0Assembler::REP(const Operand &integerRegister)
	{
		annotate("REP(%s)", integerRegister.string());

		// One shared counter slot; its address identifies the register used
		static int rep;

		mov(x32(&rep), dword_ptr [&i[integerRegister.index]]);

		spillAll();
		label("rep");

		cmp(r32(&rep), 0);

		spillAll();
		jz("endrep");
		dec(r32(&rep));
	}

	// RET: spills all allocated registers and emits a ret instruction.
	void VS_2_0Assembler::RET()
	{
		annotate("RET()");

		spillAll();
		ret();
	}

	// Helper macro instructions

	// Copies src into tmp and, when src carries the NEGATE modifier, flips the
	// sign bit of every lane.
	void VS_2_0Assembler::NEG(Operand &tmp, const Operand &src)
	{
		static const int4 signBits = {0x80000000, 0x80000000, 0x80000000, 0x80000000};

		movaps(x128(tmp), m128(src));

		if(src.mod != Operand::NEGATE)
		{
			return;
		}

		xorps(r128(tmp), xmmword_ptr [signBits]);
	}

	// Copies src into tmp and rearranges its lanes with shufps, using the
	// immediate returned by src.swizzle().
	void VS_2_0Assembler::SWIZZLE(Operand &tmp, const Operand &src)
	{
		movaps(x128(tmp), m128(src));
		shufps(r128(tmp), r128(tmp), src.swizzle());
	}

	// Stores tmp into dst, restricted to the components selected by dst.sel.
	//  - OUTPUT_COLOR_REGISTER destinations are always written as a whole.
	//  - An x-only mask uses movss, a full mask uses movaps.
	//  - Any other mask blends arithmetically: dst += (tmp - dst) & mask,
	//    which also overwrites tmp with the masked difference.
	// Throws INTERNAL_ERROR when dst.sel matches none of the known masks.
	void VS_2_0Assembler::MASK(Operand &dst, Operand &tmp)
	{
		if(dst.type == Operand::OUTPUT_COLOR_REGISTER)
		{
			movaps(x128(dst), m128(tmp));
			return;
		}

		if(dst.sel == xMask)
		{
			movss(x128(dst), xmm32(tmp));
		}
		else if(dst.sel == xyzwMask)
		{
			movaps(x128(dst), r128(tmp));
		/*
			// Free old dst and make tmp the new dst
			for(int i = 0; i < 8; i++)
			{
				if(xmm[i] == dst)
				{
					free(i);
					break;
				}
			}

			for(i = 0; i < 8; i++)
			{
				if(xmm[i] == tmp)
				{
					free(i);
					allocate(i, dst);
					break;
				}
			}

			// Not allocated to a register
			if(i == 8)
			{
				movaps(x128(dst), m128(tmp));
			}
		*/
		}
		else
		{
			// Lane masks, indexed by m below: all ones keeps a lane
			static const int4 MASK[] = {{-1,  0,  0,  0},	// x
			                            { 0, -1,  0,  0},	// y
			                            { 0,  0, -1,  0},	// z
		                                { 0,  0,  0, -1},	// w
			                            {-1, -1,  0,  0},	// xy
			                            {-1,  0, -1,  0},	// xz
		                                {-1,  0,  0, -1},	// xw
			                            { 0, -1, -1,  0},	// yz
			                            { 0, -1,  0, -1},	// yw
			                            { 0,  0, -1, -1},	// zw
			                            {-1, -1, -1,  0},	// xyz
			                            {-1, -1,  0, -1},	// xyw
		                                {-1,  0, -1, -1},	// xzw
			                            { 0, -1, -1, -1},	// yzw
			                            {-1, -1, -1, -1}};	// xyzw

			int m = -1;

			// The x and xyzw entries cannot match here; both were handled above
			if(dst.sel == xMask) m = 0;
			if(dst.sel == yMask) m = 1;
			if(dst.sel == zMask) m = 2;
			if(dst.sel == wMask) m = 3;
			if(dst.sel == xyMask) m = 4;
			if(dst.sel == xzMask) m = 5;
			if(dst.sel == xwMask) m = 6;
			if(dst.sel == yzMask) m = 7;
			if(dst.sel == ywMask) m = 8;
			if(dst.sel == zwMask) m = 9;
			if(dst.sel == xyzMask) m = 10;
			if(dst.sel == xywMask) m = 11;
			if(dst.sel == xzwMask) m = 12;
			if(dst.sel == yzwMask) m = 13;
			if(dst.sel == xyzwMask) m = 14;

			if(m == -1) throw INTERNAL_ERROR;

			// dst += (tmp - dst) & mask
			subps(r128(tmp), m128(dst));
			andps(r128(tmp), xmmword_ptr [&MASK[m]]);
			addps(r128(dst), m128(tmp));
		}
	}

	// When the current instruction carries the _SAT modifier, copies tmp into
	// dst and clamps every lane to the range [0, 1]. Emits nothing otherwise.
	void VS_2_0Assembler::SAT(Operand &dst, Operand &tmp)
	{
		if(instruction->modifier != Instruction::_SAT)
		{
			return;
		}

		static const float4 lower = {0, 0, 0, 0};
		static const float4 upper = {1, 1, 1, 1};

		movaps(x128(dst), m128(tmp));
		maxps(r128(dst), xmmword_ptr [lower]);
		minps(r128(dst), xmmword_ptr [upper]);
	}

	// Loads src into tmp with its source modifiers applied: NEG copies the
	// value (negating it if requested), then src's component selection is
	// copied into tmp.sel and SWIZZLE rearranges tmp in place accordingly.
	// Note that tmp.sel is left overwritten with src.sel.
	void VS_2_0Assembler::NEG_SWIZZLE(Operand &tmp, const Operand &src)
	{
		NEG(tmp, src);
		tmp.sel = src.sel;
		SWIZZLE(tmp, tmp);
	}

	// Final store used by most instructions: saturates tmp in place (only when
	// the instruction has the _SAT modifier) and writes it to dst through
	// dst's own write mask.
	void VS_2_0Assembler::SAT_MASK(Operand &dst, Operand &tmp)
	{
		SAT(tmp, tmp);
		MASK(dst, tmp);
	}

	// Saturates tmp in place (when requested by the instruction modifier) and
	// stores it to dst with the write mask forced to x, whatever dst selects.
	void VS_2_0Assembler::SAT_MOV_X(Operand &dst, Operand &tmp)
	{
		// Work on a copy so the caller's operand keeps its own mask
		Operand target(dst);
		target.sel = xMask;

		SAT(tmp, tmp);
		MASK(target, tmp);
	}

	// Saturates tmp in place (when requested by the instruction modifier) and
	// stores it to dst with the write mask forced to xy, whatever dst selects.
	void VS_2_0Assembler::SAT_MOV_XY(Operand &dst, Operand &tmp)
	{
		// Work on a copy so the caller's operand keeps its own mask
		Operand target(dst);
		target.sel = xyMask;

		SAT(tmp, tmp);
		MASK(target, tmp);
	}

	// Saturates tmp in place (when requested by the instruction modifier) and
	// stores it to dst with the write mask forced to xyz, whatever dst selects.
	void VS_2_0Assembler::SAT_MOV_XYZ(Operand &dst, Operand &tmp)
	{
		// Work on a copy so the caller's operand keeps its own mask
		Operand target(dst);
		target.sel = xyzMask;

		SAT(tmp, tmp);
		MASK(target, tmp);
	}

	// Saturates tmp in place (when requested by the instruction modifier) and
	// stores it to dst with the write mask forced to xyzw, whatever dst selects.
	void VS_2_0Assembler::SAT_MOV_XYZW(Operand &dst, Operand &tmp)
	{
		// Work on a copy so the caller's operand keeps its own mask
		Operand target(dst);
		target.sel = xyzwMask;

		SAT(tmp, tmp);
		MASK(target, tmp);
	}
}