#include "PixelPipeline.hpp"

#include "Texture.hpp"

#include <float.h>

namespace swShader
{
	using namespace SoftWire;

	// Storage for the static working variables referenced by the generated code.

	// Pixel x coordinate; encode() loads it from lx and steps it up to rx
	int PixelPipeline::x;

	// Per-pixel interpolants seeded from w and z in setupInterpolants()
	// and stepped by dw_dx / dz_dx in interpolate()
	float4 PixelPipeline::RHW = {1};
	float4 PixelPipeline::Z = {0};

	// Fixed-point per-pixel steps for v[0] and v[1], converted from dC_dx / dL_dx
	word4 PixelPipeline::dc_dx = {0};
	word4 PixelPipeline::dl_dx = {0};

	// Reciprocal of RHW, computed in sampleTexture() and multiplied into the texture coordinates
	float4 PixelPipeline::W = {1};

	// t[i]: texture coordinate sets copied from T[i]; v[0]/v[1]: fixed-point copies of C and L
	float4 PixelPipeline::t[8] = {0};
	word4 PixelPipeline::v[2] = {0};

	// Texture stage registers: 'result' is the DESTINATION_CURRENT target and the value written
	// by writePixel(), 'current' receives the sampled texel, 'temp' is the DESTINATION_TEMP target
	word4 PixelPipeline::result = {0};
	word4 PixelPipeline::current = {0};
	word4 PixelPipeline::temp = {0};

	// Starts without generated code; the routine is assembled on demand by
	// execute() or executable().
	PixelPipeline::PixelPipeline()
	{
		// No reciprocal-W computation has been emitted yet
		perspectiveCorrected = false;

		// Null means "not encoded yet"
		code = 0;
	}

	// This class releases nothing itself.
	PixelPipeline::~PixelPipeline()
	{
		// Intentionally empty
	}

	// Runs the generated scanline routine, assembling it first when necessary.
	// Throws INTERNAL_ERROR when encoding does not produce a routine.
	void PixelPipeline::execute()
	{
		if(code)
		{
			// Already assembled
			code();
			return;
		}

		encode();

		if(!code)
		{
			throw INTERNAL_ERROR;
		}

		code();
	}

	// Returns a pointer to the generated scanline routine, assembling it first
	// when necessary. Throws INTERNAL_ERROR when encoding does not produce a routine.
	void (*PixelPipeline::executable())()
	{
		if(code)
		{
			return code;
		}

		encode();

		if(!code)
		{
			throw INTERNAL_ERROR;
		}

		return code;
	}

	// No-op in this class.
	void PixelPipeline::loadConstants()
	{
		// Nothing to load
	}

	// No-op in this class; both arguments are ignored.
	void PixelPipeline::setConstant(int index, const float value[4])
	{
		(void)index;
		(void)value;
	}

	// Assembles the scanline routine and stores its entry point in 'code'.
	//
	// The emitted routine loads x from lx, returns immediately when x >= rx, and otherwise
	// sets up the interpolants and runs the per-pixel code until x reaches rx.
	//
	// Assembler errors of type Error are rethrown with a descriptive prefix; any other
	// exception is converted to INTERNAL_ERROR.
	void PixelPipeline::encode()
	{
		// Debug builds echo the generated assembly to a file
		#ifndef NDEBUG
			setEchoFile("PixelPipeline.asm");
		#endif

		try
		{
			// Save the caller's general purpose registers; restored by popad() below
			pushad();
			freeAll();

			// Skip the whole span when it is empty (lx >= rx)
			mov(x32(&x), dword_ptr [&lx]);
			cmp(r32(&x), dword_ptr [&rx]);
			jge("return");
			{
				setupInterpolants();

				// Spill before the loop label, as is done before every jump target in this file
				spillAll();
			label("scanlineLoop");
				{
					pixel();

					inc(r32(&x));
					cmp(r32(&x), dword_ptr [&rx]);

					// Spill between the compare and the jump so the loop head sees the same state
					// NOTE(review): relies on the spill code not altering the flags set by cmp - confirm
					spillAll();
					jnge("scanlineLoop");
				}

				// Leave MMX state before returning to the caller
				emms();
			}
		label("return");
			popad();
			ret();
		}
		catch(const Error &error)
		{
			throw Error("Fatal pixel shader assembler error: ") << error;
		}
		catch(...)
		{
			throw INTERNAL_ERROR;
		}

		code = finalize();
	}

	// Emits the per-scanline setup code: converts the color (C) and light (L) values and their
	// x gradients from float to packed 16-bit fixed-point, and copies the texture coordinates,
	// w and z into the working variables stepped by interpolate().
	void PixelPipeline::setupInterpolants()
	{
		annotate("setupInterpolants()");

		// Scale to 4.12 fixed-point format
		static const float4 scale = {0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF};
		static float4 v0;   // Float scratch
		static qword c0;    // Lower two converted components, then the packed result
		static qword c1;    // Upper two converted components

		// Diffuse color and its gradient, only needed for Gouraud shading
		if(FVF.hasColor() && shadingMode == SHADING_GOURAUD)
		{
			// Convert two components at a time (cvttps2pi handles the low pair; movhlps brings
			// down the high pair), then pack the four dwords into four words
			movaps(x128(&v0), xmmword_ptr [&C]);
			mulps(r128(&v0), xmmword_ptr [&scale]);
			cvttps2pi(x64(&c0), r128(&v0));
			movhlps(x128(&v0), r128(&v0));
			cvttps2pi(x64(&c1), r128(&v0));
			packssdw(r64(&c0), r64(&c1));
			psllw(r64(&c0), 4);   // Scale to 0.16 fixed-point format
			movq(qword_ptr [&v[0]], r64(&c0));

			movaps(x128(&v0), xmmword_ptr [&dC_dx]);
			mulps(r128(&v0), xmmword_ptr [&scale]);
			cvttps2pi(x64(&c0), r128(&v0));
			movhlps(x128(&v0), r128(&v0));
			cvttps2pi(x64(&c1), r128(&v0));
			packssdw(r64(&c0), r64(&c1));
			psllw(r64(&c0), 4);   // Scale to signed 0.16 fixed-point format
			movq(qword_ptr [&dc_dx], r64(&c0));
		}

		// Light (specular) color and its gradient, converted the same way
		if(FVF.hasLight() && specularEnable)
		{
			movaps(x128(&v0), xmmword_ptr [&L]);
			mulps(r128(&v0), xmmword_ptr [&scale]);
			cvttps2pi(x64(&c0), r128(&v0));
			movhlps(x128(&v0), r128(&v0));
			cvttps2pi(x64(&c1), r128(&v0));
			packssdw(r64(&c0), r64(&c1));
			psllw(r64(&c0), 4);   // Scale to 0.16 fixed-point format
			movq(qword_ptr [&v[1]], r64(&c0));

			movaps(x128(&v0), xmmword_ptr [&dL_dx]);
			mulps(r128(&v0), xmmword_ptr [&scale]);
			cvttps2pi(x64(&c0), r128(&v0));
			movhlps(x128(&v0), r128(&v0));
			cvttps2pi(x64(&c1), r128(&v0));
			packssdw(r64(&c0), r64(&c1));
			psllw(r64(&c0), 4);   // Scale to signed 0.16 fixed-point format
			movq(qword_ptr [&dl_dx], r64(&c0));
		}
		
		// Copy the starting texture coordinates of every coordinate set in the vertex format
		for(int i = 0; i < FVF.textureCount(); i++)
		{
			movaps(x128(&v0), xmmword_ptr [&T[i]]);
			movaps(xmmword_ptr [&t[i]], r128(&v0));
		}

		// Starting w and z (scalar copies into the first component of RHW and Z)
		movss(x128(&v0), dword_ptr [&w]);
		movss(dword_ptr [&RHW], r128(&v0));

		movss(x128(&v0), dword_ptr [&z]);
		movss(dword_ptr [&Z], r128(&v0));

		free(&v0);
		free(&c0);
		free(&c1);
	}

	// Emits the code for one pixel: depth test, the eight texture stages, alpha test, shading,
	// blending and the color write, followed by the interpolant step.
	//
	// depthTest() and alphaTest() emit jumps to the "zFail" and "alphaFail" labels defined here,
	// so a rejected pixel skips straight to interpolate().
	void PixelPipeline::pixel()
	{
		depthTest();
		{
			// Each stage returns without emitting code when it is disabled
			for(int i = 0; i < 8; i++)
			{
				sampleTexture(i);
				blendTexture(i);
			}

			alphaTest();
			{
				diffusePixel();
				specularPixel();

				alphaBlend();
				writePixel();
			}
			// Spill before each label, matching the spillAll() issued before the jumps that target it
			spillAll();
		label("alphaFail");
		}
		spillAll();
	label("zFail");

		interpolate();
	}

	void PixelPipeline::depthTest()
	{
		annotate("depthTest()");

		static float4 tmp;

		movss(x128(&tmp), dword_ptr [&Z]);
		comiss(r128(&tmp), dword_ptr [r32(&depthBuffer)+4*r32(&x)]);

		spillAll();

		switch(depthCompareMode)
		{
		case DEPTH_ALWAYS:
			break;
		case DEPTH_NEVER:
			jmp("zFail");
			break;
		case DEPTH_LESS:
			jnb("zFail");
			break;
		case DEPTH_GREATEREQUAL:
			jnae("zFail");
			break;
		case DEPTH_LESSEQUAL:
			jnbe("zFail");
			break;
		case DEPTH_GREATER:
			jna("zFail");
			break;
		default:
			throw INTERNAL_ERROR;
		}

		if(depthWriteEnable)
		{
			movss(dword_ptr [r32(&depthBuffer)+4*r32(&x)], r128(&tmp));
		}

		free(&tmp);
		free(&depthBuffer);
	}

	// Emits the texture fetch for one stage, leaving the texel in 'current' in 1.3.12
	// fixed-point format.
	//
	// Nothing is emitted when the stage is disabled or when none of its three arguments reads
	// the texture. The emitted code multiplies the stage's coordinate set by W (the reciprocal
	// of RHW, computed once per pixel by the first stage that samples), applies the addressing
	// mode, selects a mipmap level and fetches with either linear or point filtering.
	//
	// stage: index into sampler[].
	void PixelPipeline::sampleTexture(int stage)
	{
		annotate("sampleTexture(%d)", stage);

		if(sampler[stage].stageOperation == Sampler::STAGE_DISABLE) return;
		if(sampler[stage].firstArgument != Sampler::SOURCE_TEXTURE &&
		   sampler[stage].secondArgument != Sampler::SOURCE_TEXTURE &&
	       sampler[stage].thirdArgument != Sampler::SOURCE_TEXTURE) return;

		// Byte offsets of the Texture members addressed from the generated code
		const int mipmapOffset	= (int)&((Texture*)0)->mipmap;
		const int uFracOffset	= (int)&((Texture*)0)->uFrac;
		const int vFracOffset	= (int)&((Texture*)0)->vFrac;
		const int uIntOffset	= (int)&((Texture*)0)->uInt;
		const int vIntOffset	= (int)&((Texture*)0)->vInt;
		const int uHalfOffset	= (int)&((Texture*)0)->uHalf;
		const int vHalfOffset	= (int)&((Texture*)0)->vHalf;

		static float4 UV;

		// Use the stage's coordinate set when the vertex format provides it, zero otherwise
		if(FVF.textureCount() > sampler[stage].texCoordIndex)
		{
			movaps(x128(&UV), m128(&t[sampler[stage].texCoordIndex]));
		}
		else
		{
			xorps(r128(&UV), m128(&UV));
		}

		// W = 1 / RHW, emitted only once: rcpss estimate refined as W' = 2*W - RHW*W*W,
		// then broadcast to all four components
		if(!perspectiveCorrected)
		{
			static float4 tmp;

			movss(x128(&tmp), dword_ptr [&RHW]);
			rcpss(r128(&W), r128(&tmp));
			mulss(r128(&tmp), r128(&W));
			mulss(r128(&tmp), r128(&W));
			addss(r128(&W), r128(&W));
			subss(r128(&W), r128(&tmp));							free(&tmp);
			shufps(r128(&W), r128(&W), 0x00);

			perspectiveCorrected = true;
		}

		mulps(r128(&UV), m128(&W));

		if(sampler[stage].addressingMode == Sampler::ADDRESSING_CLAMP)
		{
			// u and v are limited to [0, 1]
			// NOTE(review): FLT_MIN is the smallest positive float, so the last two components
			// are clamped to be positive rather than left unbounded - confirm this is intended
			static const float4 clampZero = {0, 0, FLT_MIN, FLT_MIN};
			static const float4 clampOne = {1, 1, FLT_MAX, FLT_MAX};

			maxps(r128(&UV), xmmword_ptr [&clampZero]);
			minps(r128(&UV), xmmword_ptr [&clampOne]);
		}

		// u and v become 16.16 fixed-point dwords in 'uv'; the last component is scaled for the LOD
		static const float4 scale = {1 << 16, 1 << 16, 0, 1 << 16};
		static dword2 uv;

		mulps(r128(&UV), xmmword_ptr [&scale]);
		cvtps2pi(x64(&uv), r128(&UV));

		if(sampler[stage].addressingMode == Sampler::ADDRESSING_MIRROR)
		{
			// Static like every other scratch variable in this file: the register allocator is
			// handed the variable's address, which must stay valid after this function returns
			static word4 tmp0;
			static word4 tmp1;

			// Build a per-coordinate mask from bit 16 (the lowest integer bit) and flip the
			// coordinate with it
			movq(x64(&tmp0), r64(&uv));
			pshufw(x64(&tmp1), r64(&uv), 0xDD);
			pslld(r64(&tmp0), 15);
			pslld(r64(&tmp1), 15);
			psrad(r64(&tmp0), 31);
			psrad(r64(&tmp1), 31);
			punpckldq(r64(&tmp0), r64(&tmp1));					free(&tmp1);
			pxor(r64(&uv), r64(&tmp0));							free(&tmp0);
		}
	
		// Mipmap LOD
		static int lod;
		static int texture;
		static int buffer;
	
		// lod = index of the highest set bit of (last component of UV) * W
		// NOTE(review): bsr leaves its destination undefined for a zero source - confirm the
		// value can never be zero here
		shufps(r128(&UV), r128(&UV), 0xFF);
		mulss(r128(&UV), r128(&W));
		cvtss2si(x32(&lod), r128(&UV));								free(&UV);
		bsr(x32(&lod), r32(&lod));
	
		// buffer = texture->mipmap[lod]
		mov(x32(&texture), dword_ptr [&sampler[stage].texture]);
		mov(x32(&buffer), dword_ptr [r32(&texture)+4*r32(&lod)+mipmapOffset]);

		static int texel;

		if(sampler[stage].textureFilter == Sampler::FILTER_LINEAR)
		{
			static const qword _F_F = 0x0000FFFF0000FFFF;
			static const qword __FF = 0x00000000FFFFFFFF;

			// uuuu reuses the storage (and register) of uv
			static word4 &uuuu = (word4&)uv;
			static word4 vvvv;

			// Broadcast the 16-bit fractions of v and u
			pshufw(x64(&vvvv), r64(&uv), 0xAA);
			pshufw(x64(&uuuu), r64(&uv), 0x00);

			paddw(r64(&uuuu), qword_ptr [r32(&texture)+8*r32(&lod)+uHalfOffset]);
			paddw(r64(&vvvv), qword_ptr [r32(&texture)+8*r32(&lod)+vHalfOffset]);

			static dword2 i12;   // Indexes for texel 1 & 2
			static dword2 i34;   // Indexes for texel 3 & 4
			// NOTE(review): used through a 64-bit register although declared as dword - confirm
			// its storage is large enough should the allocator ever spill it
			static dword tmp;

			movq(x64(&i12), r64(&uuuu));
			movq(x64(&i34), r64(&uuuu));
			movq(x64(&tmp), r64(&vvvv));
			psrlw(r64(&tmp), qword_ptr [r32(&texture)+8*r32(&lod)+vFracOffset]);
			punpckhwd(r64(&i12), r64(&tmp));
			punpcklwd(r64(&i34), r64(&tmp));										free(&tmp);
			psrld(r64(&i12), qword_ptr [r32(&texture)+8*r32(&lod)+uFracOffset]);
			psrld(r64(&i34), qword_ptr [r32(&texture)+8*r32(&lod)+uFracOffset]);

			// Bilinear weights: per-axis fractions, complemented in half of the lanes, multiplied
			psllw(r64(&uuuu), qword_ptr [r32(&texture)+8*r32(&lod)+uIntOffset]);
			psllw(r64(&vvvv), qword_ptr [r32(&texture)+8*r32(&lod)+vIntOffset]);
			pxor(r64(&uuuu), qword_ptr [&_F_F]);
			pxor(r64(&vvvv), qword_ptr [&__FF]);
			pmulhuw(r64(&uuuu), r64(&vvvv));										free(&vvvv);

			static word4 &weights = uuuu;

			// The first texel is accumulated directly in 'current'
			static word4 &c1 = current;
			static word4 c2;
			static word4 c3;
			static word4 c4;

			// Fetch the four texels; punpcklbw places each byte in the high half of a word
			movd(r32(&texel), r64(&i12));
			punpcklbw(x64(&c1), qword_ptr [r32(&buffer)+r32(&texel)*4]);
			psrlq(r64(&i12), 32);
			movd(r32(&texel), r64(&i12));									free(&i12);
			punpcklbw(x64(&c2), qword_ptr [r32(&buffer)+r32(&texel)*4]);

			movd(r32(&texel), r64(&i34));
			punpcklbw(x64(&c3), qword_ptr [r32(&buffer)+r32(&texel)*4]);
			psrlq(r64(&i34), 32);		
			movd(r32(&texel), r64(&i34));									free(&i34);
			punpcklbw(x64(&c4), qword_ptr [r32(&buffer)+r32(&texel)*4]);	free(&texel);

			static word4 factor;

			// Weight every texel and sum with unsigned saturation
			pshufw(x64(&factor), r64(&weights), 0xAA);
			pmulhuw(r64(&c1), r64(&factor));
			pshufw(x64(&factor), r64(&weights), 0xFF);
			pmulhuw(r64(&c2), r64(&factor));
			pshufw(x64(&factor), r64(&weights), 0x00);
			pmulhuw(r64(&c3), r64(&factor));
			pshufw(x64(&factor), r64(&weights), 0x55);			free(&weights);
			pmulhuw(r64(&c4), r64(&factor));					free(&factor);

			paddusw(r64(&c3), r64(&c4));
			paddusw(r64(&c1), r64(&c2));
			paddusw(r64(&c1), r64(&c3));
		}
		else   // FILTER_POINT
		{
			static word4 i0;

			// Combine the integer parts of u and v into a single texel index
			pshufw(x64(&i0), r64(&uv), 0xAA);
			psrlw(r64(&i0), qword_ptr [r32(&texture)+8*r32(&lod)+vFracOffset]);
			punpcklwd(r64(&uv), r64(&i0));											free(&i0);
			psrld(r64(&uv), qword_ptr [r32(&texture)+8*r32(&lod)+uFracOffset]);

			movd(r32(&texel), r64(&uv));
			punpcklbw(x64(&current), qword_ptr [r32(&buffer)+r32(&texel)*4]);		free(&texel);
		}

		free(&lod);
		free(&texture);
		free(&buffer);

		// Scale to 1.3.12 fixed-point format
		psrlw(r64(&current), 4);
	}

	// Emits the color operation of one texture stage.
	//
	// The three stage arguments are resolved to working variables (texel, constant, current
	// result, diffuse, specular, temp or zero), optionally passed through a modifier (inverted
	// color, replicated alpha, inverted alpha) and combined according to stageOperation. The
	// outcome is written to 'result' or 'temp' as selected by destinationArgument.
	//
	// Nothing is emitted for a disabled stage. Throws INTERNAL_ERROR for any unhandled
	// enumeration value.
	//
	// stage: index into sampler[].
	void PixelPipeline::blendTexture(int stage)
	{
		annotate("blendTexture(%d)", stage);

		if(sampler[stage].stageOperation == Sampler::STAGE_DISABLE) return;

		word4 *firstArgument = &result;
		word4 *secondArgument = &current;
		word4 *thirdArgument = &v[0];
		word4 *destArgument = &result;

		static word4 zero = {0, 0, 0, 0};

		switch(sampler[stage].firstArgument)
		{
		case Sampler::SOURCE_VOID:		firstArgument = &zero;										break;
		case Sampler::SOURCE_TEXTURE:	firstArgument = &current;									break;
		case Sampler::SOURCE_CONSTANT:	firstArgument = (word4*)&sampler[stage].constantColorShort;	break;
		case Sampler::SOURCE_CURRENT:	firstArgument = &result;									break;
		case Sampler::SOURCE_DIFFUSE:	firstArgument = &v[0];										break;
		case Sampler::SOURCE_SPECULAR:	firstArgument = &v[1];										break;
		case Sampler::SOURCE_TEMP:		firstArgument = &temp;										break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(sampler[stage].secondArgument)
		{
		case Sampler::SOURCE_VOID:		secondArgument = &zero;											break;
		case Sampler::SOURCE_TEXTURE:	secondArgument = &current;										break;
		case Sampler::SOURCE_CONSTANT:	secondArgument = (word4*)&sampler[stage].constantColorShort;	break;
		case Sampler::SOURCE_CURRENT:	secondArgument = &result;										break;
		case Sampler::SOURCE_DIFFUSE:	secondArgument = &v[0];											break;
		case Sampler::SOURCE_SPECULAR:	secondArgument = &v[1];											break;
		case Sampler::SOURCE_TEMP:		secondArgument = &temp;											break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(sampler[stage].thirdArgument)
		{
		case Sampler::SOURCE_VOID:		thirdArgument = &zero;										break;
		case Sampler::SOURCE_TEXTURE:	thirdArgument = &current;									break;
		case Sampler::SOURCE_CONSTANT:	thirdArgument = (word4*)&sampler[stage].constantColorShort;	break;
		case Sampler::SOURCE_CURRENT:	thirdArgument = &result;									break;
		case Sampler::SOURCE_DIFFUSE:	thirdArgument = &v[0];										break;
		case Sampler::SOURCE_SPECULAR:	thirdArgument = &v[1];										break;
		case Sampler::SOURCE_TEMP:		thirdArgument = &temp;										break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(sampler[stage].destinationArgument)
		{
		case Sampler::DESTINATION_CURRENT:	destArgument = &result;	break;
		case Sampler::DESTINATION_TEMP:		destArgument = &temp;	break;
		default:
			throw INTERNAL_ERROR;
		}

		// 0.5 in 1.3.12 fixed-point format
		static const word4 half = {(1 << 12) / 2, (1 << 12) / 2, (1 << 12) / 2, (1 << 12) / 2};
		static const word4 neg = {-1, -1, -1, -1};
		const static word4 inv = {0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF};

		// Scratch copies holding modified arguments. Static like every other scratch variable in
		// this file: the register allocator is handed the variable's address, which must stay
		// valid after this function returns.
		static word4 modArg1;
		static word4 modArg2;
		static word4 modArg3;

		// Modifiers work on a copy so the source variable itself is left untouched.
		// Inversion is an XOR with 0x0FFF; the alpha variants replicate word 0 first.
		switch(sampler[stage].firstModifier)
		{
		case Sampler::MODIFIER_COLOR:
			break;
		case Sampler::MODIFIER_INVCOLOR:
			movq(x64(&modArg1), m64(firstArgument));
			pxor(r64(&modArg1), qword_ptr [&inv]);
			firstArgument = &modArg1;
			break;
		case Sampler::MODIFIER_ALPHA:
			movq(x64(&modArg1), m64(firstArgument));
			pshufw(x64(&modArg1), r64(&modArg1), 0x00);
			firstArgument = &modArg1;
			break;
		case Sampler::MODIFIER_INVALPHA:
			movq(x64(&modArg1), m64(firstArgument));
			pshufw(x64(&modArg1), r64(&modArg1), 0x00);
			pxor(r64(&modArg1), qword_ptr [&inv]);
			firstArgument = &modArg1;
			break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(sampler[stage].secondModifier)
		{
		case Sampler::MODIFIER_COLOR:
			break;
		case Sampler::MODIFIER_INVCOLOR:
			movq(x64(&modArg2), m64(secondArgument));
			pxor(r64(&modArg2), qword_ptr [&inv]);
			secondArgument = &modArg2;
			break;
		case Sampler::MODIFIER_ALPHA:
			movq(x64(&modArg2), m64(secondArgument));
			pshufw(x64(&modArg2), r64(&modArg2), 0x00);
			secondArgument = &modArg2;
			break;
		case Sampler::MODIFIER_INVALPHA:
			movq(x64(&modArg2), m64(secondArgument));
			pshufw(x64(&modArg2), r64(&modArg2), 0x00);
			pxor(r64(&modArg2), qword_ptr [&inv]);
			secondArgument = &modArg2;
			break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(sampler[stage].thirdModifier)
		{
		case Sampler::MODIFIER_COLOR:
			break;
		case Sampler::MODIFIER_INVCOLOR:
			movq(x64(&modArg3), m64(thirdArgument));
			pxor(r64(&modArg3), qword_ptr [&inv]);
			thirdArgument = &modArg3;
			break;
		case Sampler::MODIFIER_ALPHA:
			movq(x64(&modArg3), m64(thirdArgument));
			pshufw(x64(&modArg3), r64(&modArg3), 0x00);
			thirdArgument = &modArg3;
			break;
		case Sampler::MODIFIER_INVALPHA:
			movq(x64(&modArg3), m64(thirdArgument));
			pshufw(x64(&modArg3), r64(&modArg3), 0x00);
			pxor(r64(&modArg3), qword_ptr [&inv]);
			thirdArgument = &modArg3;
			break;
		default:
			throw INTERNAL_ERROR;
		}

		static word4 tmp;

		word4 &arg1 = *firstArgument;
		word4 &arg2 = *secondArgument;
		word4 &arg3 = *thirdArgument;
		word4 &res = *destArgument;

		// Convert the interpolated colors from 0.16 to 1.3.12 fixed-point format
		// NOTE(review): without a modifier the argument aliases v[0] / v[1] itself, so this shift
		// is applied to the interpolant in place (and once per argument that selects it) - confirm
		// that later stages, specularPixel() and interpolate() still see the expected format.
		// NOTE(review): the modifiers above run before this conversion - confirm the intended order.
		if(sampler[stage].firstArgument  == Sampler::SOURCE_DIFFUSE)  {psrlw(r64(&arg1), 4);}
		if(sampler[stage].secondArgument == Sampler::SOURCE_DIFFUSE)  {psrlw(r64(&arg2), 4);}
		if(sampler[stage].thirdArgument  == Sampler::SOURCE_DIFFUSE)  {psrlw(r64(&arg3), 4);}

		if(sampler[stage].firstArgument  == Sampler::SOURCE_SPECULAR) {psrlw(r64(&arg1), 4);}
		if(sampler[stage].secondArgument == Sampler::SOURCE_SPECULAR) {psrlw(r64(&arg2), 4);}
		if(sampler[stage].thirdArgument  == Sampler::SOURCE_SPECULAR) {psrlw(r64(&arg3), 4);}

		// The address comparisons below pick an instruction order that does not overwrite an
		// argument that aliases the destination before it has been read
		switch(sampler[stage].stageOperation)
		{
		case Sampler::STAGE_DISABLE:					// Void
			throw INTERNAL_ERROR;   // Should have already returned
			break;
		case Sampler::STAGE_REPLACE:					// Arg1
			movq(x64(&res), m64(&arg1));
			break;
		case Sampler::STAGE_SELECTARG1:					// Arg1
			movq(x64(&res), m64(&arg1));
			break;
		case Sampler::STAGE_SELECTARG2:					// Arg2
			movq(x64(&res), m64(&arg2));
			break;
		case Sampler::STAGE_SELECTARG3:					// Arg3
			movq(x64(&res), m64(&arg3));
			break;
		case Sampler::STAGE_MODULATE:					// Arg1 * Arg2
			if(&res != &arg2)
			{
				movq(x64(&res), m64(&arg1));
				pmulhw(r64(&res), m64(&arg2));
			}
			else
			{
				movq(x64(&res), m64(&arg2));
				pmulhw(r64(&res), m64(&arg1));				
			}
			psllw(r64(&res), 4);   // pmulhw of two 1.3.12 values needs a 4-bit shift back to 1.3.12
			break;
		case Sampler::STAGE_MODULATE2X:					// Arg1 * Arg2 * 2
			if(&res != &arg2)
			{
				movq(x64(&res), m64(&arg1));
				pmulhw(r64(&res), m64(&arg2));
			}
			else
			{
				movq(x64(&res), m64(&arg2));
				pmulhw(r64(&res), m64(&arg1));				
			}
			psllw(r64(&res), 5);
			break;
		case Sampler::STAGE_MODULATE4X:					// Arg1 * Arg2 * 4
			if(&res != &arg2)
			{
				movq(x64(&res), m64(&arg1));
				pmulhw(r64(&res), m64(&arg2));
			}
			else
			{
				movq(x64(&res), m64(&arg2));
				pmulhw(r64(&res), m64(&arg1));				
			}
			psllw(r64(&res), 6);
			break;
		case Sampler::STAGE_ADD:						// Arg1 + Arg2
			if(&res != &arg2)
			{
				movq(x64(&res), m64(&arg1));
				paddw(r64(&res), m64(&arg2));
			}
			else
			{
				movq(x64(&res), m64(&arg2));
				paddw(r64(&res), m64(&arg1));
			}
			break;
		case Sampler::STAGE_ADDSIGNED:					// Arg1 + Arg2 - 0.5
			if(&res != &arg2)
			{
				movq(x64(&res), m64(&arg1));
				paddw(r64(&res), m64(&arg2));
			}
			else
			{
				movq(x64(&res), m64(&arg2));
				paddw(r64(&res), m64(&arg1));
			}
			psubw(r64(&res), qword_ptr [&half]);
			break;
		case Sampler::STAGE_SUBTRACT:					// Arg1 - Arg2
			if(&res != &arg2 && &res != &arg1)
			{
				movq(x64(&res), m64(&arg1));
				psubw(r64(&res), m64(&arg2));
			}
			else if(&res != &arg1)   // Res = Arg1 - Res
			{
				// ~Res + Arg1 equals Arg1 - Res - 1, i.e. one LSB below the exact difference
				pxor(r64(&res), qword_ptr [&neg]);
				paddw(r64(&res), m64(&arg1));
			}
			else if(&res != &arg2)   // Res = Res - Arg2
			{
				// Destination aliases Arg1 only: subtract in place
				psubw(r64(&res), m64(&arg2));
			}
			else   // Res = Res - Res = 0
			{
				pxor(r64(&res), m64(&res));
			}
			break;
		case Sampler::STAGE_MULTIPLYADD:				// Arg1 + Arg2 * Arg3
			if(&res != &arg3 && &res != &arg1)
			{
				movq(x64(&res), m64(&arg2));
				pmulhw(r64(&res), m64(&arg3));
				psllw(r64(&res), 4);
				paddw(r64(&res), m64(&arg1));
			}
			else   // Opportunity for further optimization
			{
				movq(x64(&tmp), m64(&arg2));
				pmulhw(r64(&tmp), m64(&arg3));
				psllw(r64(&tmp), 4);
				paddw(r64(&tmp), m64(&arg1));
				movq(m64(&res), r64(&tmp));
			}
			break;
		case Sampler::STAGE_LERP:						// Arg1 * (Arg2 - Arg3) + Arg3
			if(&res != &arg3 && &res != &arg1)
			{
				movq(x64(&res), m64(&arg2));
				psubw(r64(&res), m64(&arg3));
				pmulhw(r64(&res), m64(&arg1));
				psllw(r64(&res), 4);
				paddw(r64(&res), m64(&arg3));
			}
			else   // Opportunity for further optimization
			{
				movq(x64(&tmp), m64(&arg2));
				psubw(r64(&tmp), m64(&arg3));
				pmulhw(r64(&tmp), m64(&arg1));
				psllw(r64(&tmp), 4);
				paddw(r64(&tmp), m64(&arg3));
				movq(m64(&res), r64(&tmp));
			}
			break;
		case Sampler::STAGE_DOT3:						// Arg1r * Arg2r + Arg1g * Arg2g + Arg1b * Arg2b
			// NOTE(review): pmaddwd yields two dword sums that are then shifted and added with
			// word-sized operations - confirm the resulting scale and carry behaviour
			if(&res != &arg2)
			{
				movq(x64(&res), m64(&arg1));
				pmaddwd(r64(&res), m64(&arg2));
				psllw(r64(&res), 4);
				pshufw(x64(&tmp), m64(&res), 0x4E);
				paddw(r64(&res), r64(&tmp));
				packssdw(r64(&res), r64(&res));
			}
			else
			{
				movq(x64(&res), m64(&arg2));
				pmaddwd(r64(&res), m64(&arg1));
				psllw(r64(&res), 4);
				pshufw(x64(&tmp), m64(&res), 0x4E);
				paddw(r64(&res), r64(&tmp));
				packssdw(r64(&res), r64(&res));
			}
			break;
		default:
			throw INTERNAL_ERROR;
		}

		free(&tmp);

		free(&modArg1);
		free(&modArg2);
		free(&modArg3);
	}

	void PixelPipeline::alphaTest()
	{
		annotate("alphaTest()");

		if(!alphaTestEnable) return;

		static int alpha;

		movd(x32(&alpha), r64(&result));
		movsx(x32(&alpha), r16(&alpha));
		cmp(r32(&alpha), dword_ptr [&alphaReference]);

		spillAll();

		switch(alphaCompareMode)
		{
		case ALPHA_ALWAYS:
			break;
		case ALPHA_NEVER:
			jmp("alphaFail");
			break;
		case ALPHA_LESS:
			jnb("alphaFail");
			break;
		case ALPHA_GREATEREQUAL:
			jnae("alphaFail");
			break;
		case ALPHA_LESSEQUAL:
			jnbe("alphaFail");
			break;
		case ALPHA_GREATER:
			jna("alphaFail");
			break;
		default:
			throw INTERNAL_ERROR;
		}
	}

	// Emits the multiplication of 'result' by the interpolated diffuse color v[0].
	// Nothing is emitted when the vertex format has no diffuse color or shading is off.
	void PixelPipeline::diffusePixel()
	{
		annotate("diffusePixel()");

		if(FVF.hasDiffuse() && shadingMode != SHADING_NONE)
		{
			// Unsigned high-word multiply of each channel
			pmulhuw(r64(&result), m64(&v[0]));
		}
	}

	// Emits the addition of the interpolated specular color v[1] to 'result', with unsigned
	// saturation. Nothing is emitted unless the vertex format carries the light color and
	// specular is enabled.
	void PixelPipeline::specularPixel()
	{
		annotate("specularPixel()");
			
		// Same condition under which setupInterpolants() and interpolate() maintain v[1];
		// testing the diffuse flag here could add a v[1] that was never set up
		if(!FVF.hasLight() || !specularEnable) return;

		static word4 specular;

		movq(r64(&specular), m64(&v[1]));
		psrlw(r64(&specular), 4);   // 0.16 to 1.3.12 fixed-point format
		paddusw(r64(&result), m64(&specular));
	}

	void PixelPipeline::alphaBlend()
	{
		annotate("alphaBlend()");

		if(!alphaBlendEnable) return;

		static word4 pixel;

		if(destBlendFactor != BLEND_ZERO)
		{
			// Read pixel
			switch(colorDepth)
			{
			case COLOR_B8G8R8A8:
				movd(r64(&pixel), dword_ptr [r32(&colorBuffer)+4*r32(&x)]);
				punpcklbw(x64(&pixel), r64(&pixel));
				psllw(r64(&pixel), 4);
				break;
			default:
				throw Error("Target color depth (%d) not supported", colorDepth);
			}
		}

		static word4 sourceFactor;
		static word4 destFactor;

		const static word4 one = {1 << 12, 1 << 12, 1 << 12, 1 << 12};   // 1.0 in 1.3.12 fixed-point format
		const static word4 inv = {0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF};

		switch(sourceBlendFactor)
		{
		case BLEND_ZERO:
		//	pxor(r64(&sourceFactor), m64(&sourceFactor));   // Optimized
			break;
		case BLEND_ONE:
		//	movq(x64(&sourceFactor), qword_ptr [&one]);   // Optimized
			break;
		case BLEND_SOURCE:
			movq(x64(&sourceFactor), m64(&result));
			break;
		case BLEND_INVSOURCE:
			movq(x64(&sourceFactor), m64(&result));
			pxor(r64(&sourceFactor), qword_ptr [&inv]);
			break;
		case BLEND_DEST:
			movq(x64(&sourceFactor), m64(&pixel));
			break;
		case BLEND_INVDEST:
			movq(x64(&sourceFactor), m64(&pixel));
			pxor(r64(&sourceFactor), qword_ptr [&inv]);
			break;
		case BLEND_SOUCEALPHA:
			movq(x64(&sourceFactor), m64(&result));
			pshufw(x64(&sourceFactor), r64(&sourceFactor), 0x00);
			break;
		case BLEND_INVSOURCEALPHA:
			movq(x64(&sourceFactor), m64(&result));
			pshufw(x64(&sourceFactor), r64(&sourceFactor), 0x00);
			pxor(r64(&sourceFactor), qword_ptr [&inv]);
			break;
		case BLEND_DESTALPHA:
			movq(x64(&sourceFactor), m64(&pixel));
			pshufw(x64(&sourceFactor), r64(&sourceFactor), 0x00);
			break;
		case BLEND_INVDESTALPHA:
			movq(x64(&sourceFactor), m64(&pixel));
			pshufw(x64(&sourceFactor), r64(&sourceFactor), 0x00);
			pxor(r64(&sourceFactor), qword_ptr [&inv]);
			break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(destBlendFactor)
		{
		case BLEND_ZERO:
		//	pxor(r64(&destFactor), m64(&destFactor));   // Optimized
			break;
		case BLEND_ONE:
		//	movq(x64(&destFactor), qword_ptr [&one]);	// Optimized
			break;
		case BLEND_SOURCE:
			movq(x64(&destFactor), m64(&result));
			break;
		case BLEND_INVSOURCE:
			movq(x64(&destFactor), m64(&result));
			pxor(r64(&destFactor), qword_ptr [&inv]);
			break;
		case BLEND_DEST:
			movq(x64(&destFactor), m64(&pixel));
			break;
		case BLEND_INVDEST:
			movq(x64(&destFactor), m64(&pixel));
			pxor(r64(&destFactor), qword_ptr [&inv]);
			break;
		case BLEND_SOUCEALPHA:
			movq(x64(&destFactor), m64(&result));
			pshufw(x64(&destFactor), r64(&sourceFactor), 0x00);
			break;
		case BLEND_INVSOURCEALPHA:
			movq(x64(&destFactor), m64(&result));
			pshufw(x64(&destFactor), r64(&destFactor), 0x00);
			pxor(r64(&destFactor), qword_ptr [&inv]);
			break;
		case BLEND_DESTALPHA:
			movq(x64(&destFactor), m64(&pixel));
			pshufw(x64(&destFactor), r64(&destFactor), 0x00);
			break;
		case BLEND_INVDESTALPHA:
			movq(x64(&destFactor), m64(&pixel));
			pshufw(x64(&destFactor), r64(&destFactor), 0x00);
			pxor(r64(&destFactor), qword_ptr [&inv]);
			break;
		default:
			throw INTERNAL_ERROR;
		}

		if(sourceBlendFactor != BLEND_ONE)
		{
			if(sourceBlendFactor != BLEND_ZERO)
			{
				pmulhw(r64(&result), m64(&sourceFactor));
				psllw(r64(&result), 4);
			}
			else
			{
				pxor(r64(&result), m64(&result));
			}
		}

		if(destBlendFactor != BLEND_ZERO)
		{
			if(destBlendFactor != BLEND_ONE)
			{
				pmulhw(r64(&pixel), m64(&destFactor));
				psllw(r64(&pixel), 4);
			}

			paddw(r64(&result), m64(&pixel));
		}
	}

	// Emits the conversion of 'result' to the target pixel format and the store into the
	// color buffer at x. Throws Error for an unsupported color depth.
	void PixelPipeline::writePixel()
	{
		annotate("writePixel()");

		// Only 32-bit BGRA targets are handled
		if(colorDepth != COLOR_B8G8R8A8)
		{
			throw Error("Target color depth (%d) not supported", colorDepth);
		}

		psraw(r64(&result), 4);                     // Drop the four extra fraction bits
		pshufw(x64(&result), r64(&result), 0xC6);   // RGBA -> BGRA
		packuswb(r64(&result), r64(&result));       // Saturate each channel to a byte
		movd(dword_ptr [r32(&colorBuffer)+4*r32(&x)], r64(&result));
	}

	// Emits the per-pixel step of every interpolant: the diffuse and specular colors, the
	// texture coordinate sets in use, RHW and Z. Ends with a spill of all registers.
	void PixelPipeline::interpolate()
	{
		annotate("interpolate()");

		if(FVF.hasColor() && shadingMode == SHADING_GOURAUD)
		{
			paddw(r64(&v[0]), qword_ptr [&dc_dx]);
		}

		if(FVF.hasLight() && specularEnable)
		{
			paddw(r64(&v[1]), qword_ptr [&dl_dx]);
		}

		// Texture coordinate interpolation
		for(int i = 0; i < FVF.textureCount(); i++)
		{
			// Stages pick their coordinates through texCoordIndex, so set i is live when ANY
			// enabled stage refers to it, not just when stage i itself is enabled
			bool used = false;

			for(int stage = 0; stage < 8 && !used; stage++)   // pixel() runs eight stages
			{
				used = sampler[stage].stageOperation != Sampler::STAGE_DISABLE &&
				       sampler[stage].texCoordIndex == i;
			}

			if(used)
			{
				addps(r128(&t[i]), xmmword_ptr [&dT_dx[i]]);
			}
		}

		addss(r128(&RHW), dword_ptr[&dw_dx]);
		addss(r128(&Z), dword_ptr[&dz_dx]);

		spillAll();
	}
}