//
// LiDIA - a library for computational number theory
//   Copyright (c) 1994, 1995 by the LiDIA Group
//
// File        : fft.c
// Author      : Victor Shoup, Thomas Pfahler (TPf)
// Last change : TPf, Feb 29, 1996, initial version
//


#if defined(HAVE_MAC_DIRS) || defined(__MWERKS__)

#include <LiDIA:Fp_polynomial_fft.h>
#include <LiDIA:random.h>
#include <LiDIA:base_vector.h>
#include <LiDIA:udigit.h>

#else

#include <LiDIA/Fp_polynomial_fft.h>
#include <LiDIA/random.h>
#include <LiDIA/base_vector.h>
#include <LiDIA/udigit.h>

#endif


#define FFT_PIPELINE 1
// * an experimental optimization---implements a software pipeline
//   that on some machines is faster *


//#undef LIDIA_MUL_MOD_SPECIAL
#define LIDIA_MUL_MOD_SPECIAL 1

// If LIDIA_MUL_MOD_SPECIAL is defined, two more functions are
// defined (otherwise, udigit_multiply_mod is used):
//
// long mul_mod_sp(long a, long b, long q, double qinv)
//    returns a*b mod q
//    qinv must be  1/double(q)
//    
// long mul_mod_sp2(long a, long b, long q, double bqinv)
//    returns a*b mod q
//    bqinv must be  double(b)/double(q)
//
// Since their implementation is based on double-arithmetic,
// long-arguments must not be greater than 2^26 !!

#ifdef LIDIA_MUL_MOD_SPECIAL
// Returns a*b mod p, computed via double-precision arithmetic.
//   pinv must be 1.0/double(p).
// The quotient a*b/p is only estimated in floating point, so the raw
// remainder can be off by exactly one multiple of p; one correction step
// brings it back into [0, p).  Operands must be small enough for a*b to be
// exactly representable in a double (see the 2^26 remark above).
inline
long mul_mod_sp(long a, long b, long p, double pinv)
{
    const double ab = ((double) a) * ((double) b);
    const long q = (long) (ab * pinv);		// estimate of floor(a*b / p)
    long res = (long) (ab - (((double) q) * ((double) p)));

    if (res >= p) res -= p;			// quotient estimate was one too small
    else if (res < 0) res += p;			// quotient estimate was one too large
    return res;
}

// Returns a*b mod p, like mul_mod_sp, but takes the precomputed ratio
//   bpinv = double(b)/double(p)
// so that the quotient estimate needs only one multiplication.  Useful when
// the same factor b is applied to many different values a.
// One correction step compensates for a quotient estimate that is off by one.
inline
long mul_mod_sp2(long a, long b, long p, double bpinv)
{
    const double ab = ((double) a) * ((double) b);
    const long q = (long) (a * bpinv);		// estimate of floor(a*b / p)
    long res = (long) (ab - (((double) q) * ((double) p)));

    if (res >= p) res -= p;			// quotient estimate was one too small
    else if (res < 0) res += p;			// quotient estimate was one too large
    return res;
}
#endif // LIDIA_MUL_MOD_SPECIAL





//REMARK: to delete[] a null pointer is not an error !


/***************************************************************
                    class fft_table
***************************************************************/
					
	/***********************************
	 class fft_table::bit_reverse_table
	***********************************/

// Definition of the static bit-reversal table cache; FFT() uses it through
// BitReverse.copy() to permute its input.
fft_table::bit_reverse_table fft_table::BitReverse;


fft_table::bit_reverse_table::~bit_reverse_table()
// releases every cached permutation table, then the array of table pointers
{
    debug_handler( "fft_table::bit_reverse_table", "destructor" );
    if (mem != 0)
    {
	lidia_size_t **slot = mem;
	for (lidia_size_t left = allocated; left > 0; left--, slot++)
	    delete[] *slot;		//slots never filled are 0, which delete[] accepts
    }
    delete[] mem;
}

lidia_size_t fft_table::bit_reverse_table::rev_inc(lidia_size_t a, lidia_size_t k)
//increases 'a' in "bitreverse order" (size: 'k' bits), i.e. adds one with
//the carry running from the most significant of the k bits downwards;
//wraps around to 0 after the all-ones value.
//For k <= 0 there are no bits to increment and 'a' is returned unchanged.
{
    debug_handler( "fft_table::bit_reverse_table", "rev_inc( lidia_size_t, lidia_size_t )" );

    //guard: '1 << (k-1)' below would be a shift by a negative count
    //(table(0) reaches this through the trailing increment of its fill loop)
    if (k <= 0)
	return a;

    lidia_size_t j, m;
    j = k; m = 1 << (k-1);
    while (j && (m & a))
    {//clear the leading run of set bits (carry propagation)
	a ^= m;
	m >>= 1;
	j--;
    }
    if (j)			//set the first clear bit, unless all k bits were set
	a ^= m;
    return a;
}


lidia_size_t* fft_table::bit_reverse_table::table( lidia_size_t k)
//returns mem[k], the bit-reversal permutation of 0 .. 2^k-1;
//enlarges the cache if k >= allocated and builds the table on first use.
//The returned array is owned by the cache and must not be freed by the caller.
{
    debug_handler( "fft_table::bit_reverse_table", "table( lidia_size_t )" );
    lidia_size_t *rev, i, j;
	
    if (k >= allocated)
    {	//enlarge and copy old values
	lidia_size_t** new_mem = new lidia_size_t*[k+1];
	//check the allocation itself (testing k here would wrongly
	//report "out of memory" for the valid request k == 0)
	if (!new_mem) lidia_error_handler( "fft_table::bit_reverse_table", "table( lidia_size_t )::out of memory" );

	for (i = 0; i < allocated; i++)
	    new_mem[i] = mem[i];
	for (i = allocated; i <= k; i++)
	    new_mem[i] = 0;		//not initialized yet

	allocated = k+1;
	delete[] mem;
	mem = new_mem;
    }
	
    rev = mem[k];
    if (rev == 0)
    {//not initialized yet
	lidia_size_t n = 1 << k;
	rev = mem[k] = new lidia_size_t[n];
	if (!rev) lidia_error_handler( "fft_table::bit_reverse_table", "table( lidia_size_t )::out of memory" );

	//rev[i] is obtained from rev[i-1] by a bit-reversed increment
	for (i = 0, j = 0; i < n; i++, j = rev_inc(j, k))
	    rev[i] = j;
    }
    return rev;
}


void fft_table::bit_reverse_table::copy(sdigit* A, const sdigit* a, lidia_size_t k)
//stores a[i] in A[rev_k(i)] for i = 0 .. 2^k-1, where rev_k reverses the
//k low-order bits of i
{
    debug_handler( "fft_table::bit_reverse_table", "copy( sdigit*, sdigit*, lidia_size_t )" );

    //bit reversal on 6 bits; for k <= 6 and i < 2^k the k-bit reversal
    //of i is rev6[i] >> (6-k), so this one table serves all small sizes
    static const unsigned char rev6[64] = {
	 0, 32, 16, 48,  8, 40, 24, 56,  4, 36, 20, 52, 12, 44, 28, 60,
	 2, 34, 18, 50, 10, 42, 26, 58,  6, 38, 22, 54, 14, 46, 30, 62,
	 1, 33, 17, 49,  9, 41, 25, 57,  5, 37, 21, 53, 13, 45, 29, 61,
	 3, 35, 19, 51, 11, 43, 27, 59,  7, 39, 23, 55, 15, 47, 31, 63
    };

    if (k >= 0 && k <= 6)
    {//small sizes: no cached table needed
	const lidia_size_t count = 1 << k;
	const int drop = 6 - k;
	for (lidia_size_t src = 0; src < count; src++)
	    A[rev6[src] >> drop] = a[src];
	return;
    }

    //general case: walk both halves of the cached permutation in parallel
    lidia_size_t half = (1 << (k-1));
    const lidia_size_t *perm_lo = table(k);
    const lidia_size_t *perm_hi = perm_lo + half;
    const sdigit *a_hi = a + half;

    for (; half != 0; half--, a++, a_hi++, perm_lo++, perm_hi++)
    {
	A[*perm_lo] = *a;
	A[*perm_hi] = *a_hi;
    }
}




	/***********************************
	 class fft_table::fft_crt_table
	***********************************/

fft_table::fft_crt_table::fft_crt_table(sdigit* primevec, lidia_size_t length, const mcp& m) :
	mod(m),
	next(0)
// builds the CRT data for the first 'length' primes of 'primevec' and
// remembers the modulus 'm' it was built for; the new node is not linked
// into any list yet (next == 0)
{
    debug_handler( "fft_table::fft_crt_table", "fft_crt_table( sdigit*, lidia_size_t, mcp& )" );

    // CT.init() takes a base_vector, so the raw prime array is wrapped
    // (and thereby copied) first
    base_vector<sdigit> prime_vector(primevec, length);
    CT.init(prime_vector);
}



					
/***************************************************************
					class fft_table
***************************************************************/

sdigit fft_table::next_prime(sdigit old) const
    // returns the largest prime q with q < old and q = 1 mod 2^max_degree.
    // old == ~0 means "no previous prime": the search then starts from the
    // largest admissible value (limited to 26 bits if LIDIA_MUL_MOD_SPECIAL
    // is defined).
    // Otherwise old itself must be = 1 mod 2^max_degree.
    // error if no more primes are found.
{
    debug_handler( "fft_table", "next_prime( sdigit )" );
    sdigit cand;
    sdigit N = 1 << max_degree;

    if (old == ~0)
    {
#ifndef LIDIA_MUL_MOD_SPECIAL
	cand = max_udigit();
	if (cand < 0) cand = udigit(cand) >> 1;
	//max_udigit() might be 2^32-1 "=" -1
#else
	cand = comparator<udigit>::min((1 << 26) - 1, max_udigit());
	//our special version only works for udigits with less than 26 bits
#endif

	//clear the low max_degree bits ...
	cand >>= max_degree;
	cand <<= max_degree;
	if (max_degree!=0)
	    cand++;			//make cand = 1 mod 2^max_degree
    }
    else
    {
	cand = old;
	//written as (cand-1) % N so that the test also works for
	//max_degree == 0 (N == 1), where 'cand % N' can never equal 1
	if ((cand - 1) % N != 0)
	    lidia_error_handler( "next_prime", "not == 1 mod 2^maxroot" );
    }

    //step downwards through the residue class 1 mod N until a prime is hit
    for (;;)
    {
	if (cand <= N)	// <= (1 << max_degree)) 
	    lidia_error_handler( "fft_table", "next_prime( sdigit )::no more primes" );
	cand -= N;	//cand = cand - (1 << max_degree);
	if (!is_prime((bigint)cand,8))	//NOTE(review): 8 is presumably the number of test rounds - confirm
	    continue;
	return  cand;
    }
}


// Creates an empty table for transforms of size up to 2^l: no primes, no
// root tables and no CRT tables are allocated here (use_fft_table() fills
// them in on demand).  The new object has reference count 0 and is not
// linked into the list of tables.
fft_table::fft_table(lidia_size_t l) :
	FFT_primes(0),
	num_FFT_primes(0),
	max_degree(l),
	RootTable(0),
	RootInvTable(0),
	TwoInvTable(0),
	CRT_table_list(0),
	reference_counter(0),
	next(0)
{
    debug_handler( "fft_table", "fft_table( lidia_size_t )" );
}

	
fft_table::~fft_table()
// frees the per-prime tables, the prime vector and all CRT tables of this
// object; it is an error to destroy a table that is still referenced
{
    debug_handler( "fft_table", "destructor" );

    if (reference_counter != 0)
	lidia_error_handler( "fft_table", "destructor::reference counter is not zero" );

    //per-prime tables first ...
    lidia_size_t idx = 0;
    while (idx < num_FFT_primes)
    {
	delete[] RootTable[idx];
	delete[] RootInvTable[idx];
	delete[] TwoInvTable[idx];
	idx++;
    }

    //... then the arrays that held them
    delete[] RootTable;
    delete[] RootInvTable;
    delete[] TwoInvTable;
    delete[] FFT_primes;

    //pop and destroy the CRT tables one by one
    while (CRT_table_list != 0)
    {
	fft_crt_table* doomed = CRT_table_list;
	CRT_table_list = doomed->next;
	delete doomed;
    }
}


fft_table::fft_crt_table* fft_table::use_fft_table(const mcp& m)
	//checks if a crt_table for m is already initialized; if not, enlarges
	//prime vector (if necessary) and initializes a new crt_table.
	//The returned crt_table is linked into CRT_table_list and stays
	//owned by this fft_table (it is freed in the destructor).
{
    debug_handler( "fft_table", "use_fft_table( mcp& )" );
	
    if (m.mod().is_zero())	//I hope this will never happen
	lidia_error_handler( "fft_table", "use_fft_table( const mcp& )::modulus == 0" );

    fft_crt_table* p = CRT_table_list;

    while (p != 0)
    {
	if (p->mod == m)
	    return p;		//found a suitable crt_table
	p = p->next;
    }
//nothing suitable found

//the product of all primes must be > 2^max_degree * modulus^2
//we do not multiply the primes explicitly, but compute logarithms
//so we simply have to add bit_lengths instead of multiplying large numbers
    lidia_size_t b_log = max_degree + 2*(m.mod().bit_length()-1) + 2;
		    // = log_2(2^max_degree * modulus^2) + safety
	
    lidia_size_t sum(0), i;
    for (i = 0; i < num_FFT_primes; i++)
    {
	sum += integer_log(FFT_primes[i]);
	if (sum > b_log)
	{	//no new primes needed: the first i+1 primes suffice
	    p = new fft_crt_table(FFT_primes, i+1, m);
	    p->next = CRT_table_list;
	    CRT_table_list = p;	//link it in, so that it is found again
				//by later calls and freed by the destructor
	    return p;
	}
    }

//need more primes
    lidia_size_t old_num_FFT_primes = num_FFT_primes;
    seek_primes(b_log, sum);
	
    //now, FFT_primes and num_FFT_primes have new values
	
    sdigit **root = new sdigit*[num_FFT_primes];
    sdigit **rootinv = new sdigit*[num_FFT_primes];
    sdigit **twoinv = new sdigit*[num_FFT_primes];
    if (!root || !rootinv || !twoinv)
	lidia_error_handler( "fft_table", "use_FFT_info( const mcp& )::out of space" );
	
    if (old_num_FFT_primes != 0)
    {//copy old pointers
	for (i=0; i<old_num_FFT_primes; i++)
	{
	    root[i] = RootTable[i];
	    rootinv[i] = RootInvTable[i];
	    twoinv[i] = TwoInvTable[i];
	}
	delete[] RootTable;
	delete[] RootInvTable;
	delete[] TwoInvTable;
    }
    RootTable = root;
    RootInvTable = rootinv;
    TwoInvTable = twoinv;
    //now, FFT_primes, RootTable, RootInvTable, TwoInvTable have correct length


//compute RootTable, RootInvTable, TwoInvTable for new primes
    lidia_size_t index, j, jj;
    sdigit *rt, *rit, *tit, q, w, t;
 
    for (index = old_num_FFT_primes; index < num_FFT_primes; index++)
    {
	if (!(rt = RootTable[index] = new sdigit[max_degree+1]))
	    lidia_error_handler( "fft_table", "use_FFT_info( const mcp& )::out of space" );
	if (!(rit = RootInvTable[index] = new sdigit[max_degree+1]))
	    lidia_error_handler( "fft_table", "use_FFT_info( const mcp )::out of space" );
	if (!(tit = TwoInvTable[index] = new sdigit[max_degree+1]))
	    lidia_error_handler( "fft_table", "use_FFT_info( const mcp )::out of space" );

	q = FFT_primes[index];
	do
	{//look for a quadratic non-residue, i.e. w^((q-1)/2) == -1 mod q
	    w =  random() % q;
	}while (udigit_power_mod(w, q >> 1, q) != q-1);
		
	//q = 1 mod 2^max_degree, hence q >> max_degree == (q-1)/2^max_degree
	w = udigit_power_mod(w, q >> max_degree, q);
	//w is a primitive 2^max_degree root of unity
	
	rt[max_degree] = w;
	rit[max_degree] = udigit_invert_mod(w, q);
	tit[0] = 1;
	t = udigit_invert_mod((udigit)2, q);
#ifdef LIDIA_MUL_MOD_SPECIAL
	double qinv = 1.0/((double)q);
#endif
	//by repeated squaring rt[j] becomes a 2^j-th root of unity and
	//rit[j] its inverse; tit[jj] = 2^(-jj) mod q
	for (j = max_degree-1, jj = 1; j >= 0; j--, jj++)
	{
#ifndef LIDIA_MUL_MOD_SPECIAL
	    rt[j] = udigit_multiply_mod(rt[j+1], rt[j+1], q);
	    rit[j] = udigit_multiply_mod(rit[j+1], rit[j+1], q);
	    tit[jj] = udigit_multiply_mod(tit[jj-1], t, q);
#else
	    rt[j] = mul_mod_sp(rt[j+1], rt[j+1], q, qinv);
	    rit[j] = mul_mod_sp(rit[j+1], rit[j+1], q, qinv);
	    tit[jj] = mul_mod_sp(tit[jj-1], t, q, qinv);
#endif
	}
    }


//insert new fft_crt_table 
    p = new fft_crt_table(FFT_primes, num_FFT_primes, m);
    p->next = CRT_table_list;
    CRT_table_list = p;


    debug_handler_c( "fft_table", "use_fft_table", 1, 
	cout<<"maxroot = "<<max_degree<<"\n ["; 
	for (i=0; i<num_FFT_primes; i++) cout<<FFT_primes[i]<<"  ";
	cout<<"]"<<endl; );

#if 0
cout<<"useFFTtable : numprimes="<<num_FFT_primes<<"  maxroot="<<max_degree<<"  mod.bitlength-1="<<m.mod().bit_length()-1<<endl;	
for (int qwe=0; qwe<num_FFT_primes; qwe++)
  cout<<FFT_primes[qwe]<<"  ";
cout<<endl;
#endif

    return p;
}



void fft_table::seek_primes(lidia_size_t b_log, lidia_size_t sum_log)
//enlarges FFT_primes[] by new_num primes where
//sum_{i=0..new_num-1}(log_2(new_primes[i])) + sum_log > b_log
//The new primes are appended in decreasing order, continuing below the
//last prime already stored; num_FFT_primes is updated accordingly.
{
    debug_handler( "fft_table", "seek_primes( lidia_size_t, lidia_size_t )" );
    lidia_size_t old_num = num_FFT_primes;
    sdigit* help = 0;		//scratch buffer for the new primes, grown in steps of 64
    sdigit q, p;

    if (num_FFT_primes == 0)
	p = ~0;			//tells next_prime() to start from the top
    else
	p = FFT_primes[num_FFT_primes - 1];

    lidia_size_t new_num = 0;
    lidia_size_t num, j;

    debug_handler_c( "fft_table", "seek_primes", 1, cout<<"old : \n [";
		for (j=0; j<num_FFT_primes; j++) cout<<FFT_primes[j]<<"  ";
		cout<<"]"<<endl; );

    for (num = 0; b_log >= sum_log; num++)
    {
	if ((num % 64) == 0)
	{//need space for another 64 primes
	    sdigit* help2;
	    if (!(help2 = new sdigit[num + 64]))
		lidia_error_handler( "fft_table", "seek_primes( lidia_size_t, lidia_size_t )::out of memory" );
	    for (j = 0; j < num; j++)
		help2[j] = help[j];
	    delete[] help;
	    help = help2;
	}

	q = next_prime(p);		//q is prime and < p
	help[num] = q;
	sum_log += integer_log(q);  
	p = q;
    }

//copy results
    sdigit* new_FFT_primes;
    new_num = old_num + num;
    if (!(new_FFT_primes = new sdigit[new_num]))
	lidia_error_handler( "fft_table", "seek_primes( lidia_size_t, lidia_size_t )::out of memory" );
	
    for (j = 0; j < old_num; j++)  new_FFT_primes[j]         = FFT_primes[j];
    for (j = 0; j < num;     j++)  new_FFT_primes[j+old_num] = help[j];

    delete[] help;		//scratch buffer is no longer needed
    delete[] FFT_primes;
    FFT_primes = new_FFT_primes;
    num_FFT_primes = new_num;

    debug_handler_c( "fft_table", "seek_primes", 1,
		cout<<"new : \n [";
		for (j=0; j<num_FFT_primes; j++) cout<<FFT_primes[j]<<"  "; 
		cout<<"]"<<endl; );

}





#if (FFT_PIPELINE)

/*****************************************************

   This version of the FFT is written with an explicit
   "software pipeline".  On some machines, this yields faster code.
   Most effective in conjunction with ZZ_AVOID_BRANCHING.

*******************************************************/

void fft_table::FFT(sdigit* A, const sdigit* a, lidia_size_t k, sdigit q, const sdigit* root)
// performs a 2^k-point transform modulo q:
// a[0..2^k-1] is copied into A[] in bit-reversed order and then combined
// in k butterfly stages; stage s (2 <= s <= k) uses the powers of root[s]
// as multipliers.
// fft_data::evaluate() passes RootTable[ix], fft_data::interpolate() passes
// RootInvTable[ix] as 'root'.
// NOTE(review): presumably A and a must not overlap - confirm with callers.
{
    debug_handler( "fft_table", "FFT ( sdigit*, sdigit*, lidia_size_t, sdigit, sdigit* )" );

    // 1-point transform: nothing to combine
    if (k == 0)
    {
	A[0] = a[0];
	return;
    }

    // 2-point transform: a single butterfly, no multiplier needed
    if (k == 1)
    {
	A[0] = udigit_add_mod(a[0], a[1], q);
	A[1] = udigit_subtract_mod(a[0], a[1], q);
	return;
    }

    // assume k > 1

    lidia_size_t n = 1 << k;
    lidia_size_t s, m, m2, j;
    sdigit t, u, v, w, z, tt;
    sdigit *p1, *p2, *ub, *ub1;
#ifdef LIDIA_MUL_MOD_SPECIAL
    double qinv = ((double) 1)/((double) q);
    double wqinv, zqinv;
#endif

    BitReverse.copy(A, a, k);

    ub = A+n;			// one past the end of A[]

    // stage 1: butterflies on adjacent pairs (multiplier is 1)
    p2 = A;
    while (p2 < ub)
    {
	u = *p2;
	v = *(p2+1);
	*p2 = udigit_add_mod(u, v, q);
	*(p2+1) = udigit_subtract_mod(u, v, q);
	p2 += 2;
    }

    // stages 2 .. k-1: blocks of m = 2^s elements, partners m2 = m/2 apart
    for (s = 2; s < k; s++)
    {
	m = 1 << s;
	m2 = m >> 1;

	// position 0 of every block: multiplier is 1, so no multiplication
	p2 = A;
	p1 = p2 + m2;
	while (p2 < ub)
	{
	    u = *p2;
	    v = *p1;
	    *p2 = udigit_add_mod(u, v, q);
	    *p1 = udigit_subtract_mod(u, v, q);
	    p1 += m;
	    p2 += m;
	}

	// position j of every block: multiplier w = z^j
	z = root[s];
	w = z;
	for (j = 1; j < m2; j++)
	{
#ifdef LIDIA_MUL_MOD_SPECIAL
	    wqinv = ((double) w)*qinv;	// w/q, as required by mul_mod_sp2
#endif
	    p2 = A + j;
	    p1 = p2 + m2;

	    ub1 = ub-m;			// p2 < ub1 holds for all blocks but the last

	    // pipeline prologue: product for the first block
	    u = *p2;
#ifndef LIDIA_MUL_MOD_SPECIAL
	    t = udigit_multiply_mod(*p1, w, q);
#else
	    t = mul_mod_sp2(*p1, w, q, wqinv);
#endif

	    while (p2 < ub1)
	    {
		// start the product for the next block (tt) before the
		// results of the current block are stored
#ifndef LIDIA_MUL_MOD_SPECIAL
		tt = udigit_multiply_mod(*(p1+m), w, q);
#else
		tt = mul_mod_sp2(*(p1+m), w, q, wqinv);
#endif
		(*p2) = udigit_add_mod(u, t, q);
		(*p1) = udigit_subtract_mod(u, t, q);
		p1 += m;
		p2 += m;
		u = *p2;
		t = tt;
	    }
	    // pipeline epilogue: butterfly of the last block
	    (*p2) = udigit_add_mod(u, t, q);
	    (*p1) = udigit_subtract_mod(u, t, q);

	    // next multiplier: w = z*w (wqinv still belongs to the old w here)
#ifndef LIDIA_MUL_MOD_SPECIAL
	    w = udigit_multiply_mod(z, w, q);
#else
	    w = mul_mod_sp2(z, w, q, wqinv);
#endif
	}
    }

    // last stage (s == k): a single block, so the multiplier changes at
    // every step; A[i] is combined with A[i + n/2]
    m2 = n >> 1;
    z = root[k];
#ifdef LIDIA_MUL_MOD_SPECIAL
    zqinv = ((double) z)*qinv;
#endif
    w = 1;
    p2 = A;
    p1 = A + m2;
    m2--;			// the loop does n/2 - 1 butterflies, the last one follows it
    u = *p2;
    t = *p1;			// first multiplier is 1
    while (m2)
    {
	// multiplier and product for the next position are computed ahead
#ifndef LIDIA_MUL_MOD_SPECIAL
	w = udigit_multiply_mod(w, z, q);
	tt = udigit_multiply_mod(*(p1+1), w, q);
#else
	w = mul_mod_sp2(w, z, q, zqinv);
	tt = mul_mod_sp(*(p1+1), w, q, qinv);
#endif
	(*p2) = udigit_add_mod(u, t, q);
	(*p1) = udigit_subtract_mod(u, t, q);
	p2++;
	p1++;
	u = *p2;
	t = tt;
	m2--;
    }
    // pipeline epilogue
    (*p2) = udigit_add_mod(u, t, q);
    (*p1) = udigit_subtract_mod(u, t, q);
}



#else


/*****************************************************

   This version of the FFT has no "software pipeline".

******************************************************/



void fft_table::FFT(sdigit* A, const sdigit* a, lidia_size_t k, sdigit q, const sdigit* root)
// performs a 2^k-point transform modulo q:
// a[0..2^k-1] is copied into A[] in bit-reversed order and then combined
// in k butterfly stages; stage s (2 <= s <= k) uses the powers of root[s]
// as multipliers.
// fft_data::evaluate() passes RootTable[ix], fft_data::interpolate() passes
// RootInvTable[ix] as 'root'.
// NOTE(review): presumably A and a must not overlap - confirm with callers.
{
    debug_handler( "fft_table", "FFT ( sdigit*, sdigit*, lidia_size_t, sdigit, sdigit* )" );

    // 1-point transform: nothing to combine
    if (k == 0)
    {
	A[0] = a[0];
	return;
    }

    // 2-point transform: a single butterfly, no multiplier needed
    if (k == 1)
    {
	A[0] = udigit_add_mod(a[0], a[1], q);
	A[1] = udigit_subtract_mod(a[0], a[1], q);
	return;
    }

    // assume k > 1

    lidia_size_t n = 1 << k;
    lidia_size_t s, m, m2, j;
    sdigit t, u, v, w, z;
    sdigit *p1, *p2, *ub;
#ifdef LIDIA_MUL_MOD_SPECIAL
    double qinv = ((double) 1)/((double) q);
    double wqinv, zqinv;
#endif

    BitReverse.copy(A, a, k);

    ub = A+n;			// one past the end of A[]

    // stage 1: butterflies on adjacent pairs (multiplier is 1)
    p2 = A;
    while (p2 < ub)
    {
	u = *p2;
	v = *(p2+1);
	(*p2) = udigit_add_mod(u, v, q);
	(*(p2+1)) = udigit_subtract_mod(u, v, q);
	p2 += 2;
    }

    // stages 2 .. k-1: blocks of m = 2^s elements, partners m2 = m/2 apart
    for (s = 2; s < k; s++)
    {
	m = 1 << s;
	m2 = m >> 1;

	// position 0 of every block: multiplier is 1, so no multiplication
	p2 = A;
	p1 = p2 + m2;
	while (p2 < ub)
	{
	    u = *p2;
	    v = *p1;
	    (*p2) = udigit_add_mod(u, v, q);
	    (*p1) = udigit_subtract_mod(u, v, q);
	    p1 += m;
	    p2 += m;
	}

	// position j of every block: multiplier w = z^j
	z = root[s];
	w = z;
	for (j = 1; j < m2; j++)
	{
#ifdef LIDIA_MUL_MOD_SPECIAL
	    wqinv = ((double) w)*qinv;	// w/q, as required by mul_mod_sp2
#endif
	    p2 = A + j;
	    p1 = p2 + m2;
	    while (p2 < ub)
	    {
		u = *p2;
		v = *p1;
#ifndef LIDIA_MUL_MOD_SPECIAL
		t = udigit_multiply_mod(v, w, q);
#else
		t = mul_mod_sp2(v, w, q, wqinv);
#endif
		(*p2) = udigit_add_mod(u, t, q);
		(*p1) = udigit_subtract_mod(u, t, q);
		p1 += m;
		p2 += m;
	    }
	    // next multiplier: w = z*w (wqinv still belongs to the old w here)
#ifndef LIDIA_MUL_MOD_SPECIAL
	    w = udigit_multiply_mod(z, w, q);
#else
	    w = mul_mod_sp2(z, w, q, wqinv);
#endif
	}
    }

    // last stage (s == k): a single block, so the multiplier changes at
    // every step; A[j] is combined with A[j + n/2]
    m2 = n >> 1;
    z = root[k];
#ifdef LIDIA_MUL_MOD_SPECIAL
    zqinv = ((double) z)*qinv;
#endif
    w = 1;
    p2 = A;
    p1 = A + m2;
    for (j = 0; j < m2; j++)
    {
	u = *p2;
	v = *p1;
#ifndef LIDIA_MUL_MOD_SPECIAL
	t = udigit_multiply_mod(v, w, q);
#else
	t = mul_mod_sp(v, w, q, qinv);
#endif
	(*p2) = udigit_add_mod(u, t, q);
	(*p1) = udigit_subtract_mod(u, t, q);
	// multiplier for the next position: w = w*z
#ifndef LIDIA_MUL_MOD_SPECIAL
	w = udigit_multiply_mod(w, z, q);
#else
	w = mul_mod_sp2(w, z, q, zqinv);
#endif
	p2++;
	p1++;
    }
}

#endif


// Head of the singly linked list of fft_table objects (linked through
// 'next'); 0 while the list is empty.
fft_table* fft_table::head = 0;


fft_table* fft_table::look_for_fft_table(lidia_size_t l)
	//returns pointer to fft_table which is at least built for
	//convolutions of degree 2^l.
	//Only the head of the list is examined: it is returned if it is
	//large enough; otherwise a new table becomes the head (replacing
	//the old head if that one is no longer referenced).
	//The reference counter of the returned table is NOT incremented.
{
    debug_handler( "fft_table", "look_for_fft_table( lidia_size_t )" );

    fft_table *front = fft_table::head;

    if (front != 0)
    {
	if (front->max_degree >= l)
	    return front;		//*head is sufficiently large

	if (front->reference_counter == 0)
	{//too small and unused: drop it from the list
	    fft_table *rest = front->next;
	    delete front;
	    fft_table::head = rest;
	}
    }

    //push a new, sufficiently large table in front of the list
    fft_table *fresh = new fft_table(l);
    fresh->next = fft_table::head;
    fft_table::head = fresh;

    return fresh;
}


void fft_table::decr_reference(fft_table* fptr)
//decrements the reference counter of *fptr.
//strategy: delete *fptr, iff it has no references and fptr!=head
//very often, we can reuse *head
{
    if (fptr == 0)
	lidia_error_handler( "fft_table", "decr_reference::argument is null pointer" );

    fptr->reference_counter--;

    //still referenced, or the head of the list: keep the table
    if (fptr->reference_counter != 0 || fptr == fft_table::head)
	return;

    if (fft_table::head == 0)
	lidia_error_handler( "fft_table", "decr_reference::list is empty" );

    //fptr is not the head here, so it must have a predecessor in the list
    fft_table* tmp = fft_table::head;
    while (tmp->next != fptr && tmp->next != 0)
	tmp = tmp->next;
    if (tmp->next == 0)
	lidia_error_handler( "fft_table", "decr_reference::item not found" );

    //now, tmp->next == fptr: unlink and destroy
    tmp->next = fptr->next;
    delete fptr;
}



/***************************************************************
					class fft_data
***************************************************************/

void fft_data::clear()
// gives up this object's reference to its fft_table, if it has one;
// note that FT itself is left unchanged (it is not reset to 0)
{
    debug_handler( "fft_data", "clear( void )" );

    if (FT == 0)
	return;

    fft_table::decr_reference(FT);
}

void fft_data::init(lidia_size_t l, const mcp& m)
// (re)initializes this object for transforms of size up to 2^l and
// modulus m: drops the old table reference, then attaches to a suitable
// fft_table and to its CRT table for m
{
    debug_handler( "fft_data", "init( lidia_size_t, mcp& )" );

    clear();

    fft_table* tbl = fft_table::look_for_fft_table(l);
    tbl->reference_counter++;
    FT = tbl;
    CT = tbl->use_fft_table(m);
}

void fft_data::init(const fft_data& x)
// makes this object share the tables of x and registers the additional
// reference; a previously held reference is NOT released here
{
    debug_handler( "fft_data", "init( const fft_data& )" );

    FT = x.FT;
    CT = x.CT;

    if (FT == 0)
	return;			//x is not attached to any table

    FT->reference_counter++;
}

bool fft_data::set_new_length(lidia_size_t l)
// makes sure that the attached fft_table can handle transforms of size
// 2^l, keeping the current modulus.
// returns true iff this object was moved to a different fft_table
// (FT and CT then have new values).
// error if no CRT table is attached (CT == 0).
{
    debug_handler( "fft_data", "set_new_length( lidia_size_t )" );
    if (CT == 0)
	lidia_error_handler( "fft_data", "set_new_length( lidia_size_t )::CT == 0" );

    fft_table* FT_new = fft_table::look_for_fft_table(l);
    bool changed = false;

    if (FT_new != FT)
    {
	//Attach to the new table FIRST: CT->mod lives inside the old
	//table, and releasing the old table may destroy it together
	//with all of its CRT tables.
	FT_new->reference_counter++;
	CT = FT_new->use_fft_table(CT->mod);

	if (FT != 0)
	    fft_table::decr_reference(FT);
	FT = FT_new;
	changed = true;
    }
    return changed;
}



					

void fft_data::divide_by_power_of_two(sdigit* vec, lidia_size_t length, lidia_size_t l, lidia_size_t index) const
	//divides vec[0..length-1] by 2^l mod FFT_prime[index]
	//(multiplies by the precomputed TwoInvTable[index][l])
	//l must be <= max_degree, index must be < num_FFT_primes
{
    debug_handler( "fft_data", "divide_by_power_of_two( sdigit*, lidia_size_t, lidia_size_t, lidia_size_t)" );
    //index == num_FFT_primes is already one past the last valid prime
    if ((l > FT->max_degree) || (index >= FT->num_FFT_primes))
	lidia_error_handler( "fft_data", "divide_by_power_of_two( sdigit*, lidia_size_t, lidia_size_t, lidia_size_t)::argument too large" );
	
    lidia_size_t i;
    sdigit t = FT->TwoInvTable[index][l];
    sdigit q = FT->FFT_primes[index];
#ifdef LIDIA_MUL_MOD_SPECIAL
    double qinv = (1.0 / (double) q);
#endif

    sdigit *vp = vec;
    for (i = length; i != 0; i--, vp++)
    {
#ifndef LIDIA_MUL_MOD_SPECIAL
	*vp = udigit_multiply_mod(*vp, t, q);
#else
	*vp = mul_mod_sp(*vp, t, q, qinv);
#endif
    } 	
}



void fft_data::evaluate(sdigit* A, const sdigit* a, lidia_size_t k, lidia_size_t ix) const
	//A = 2^k-point FFT of a modulo FFT_prime[ix], using RootTable[ix]
	//k must be <= max_degree, ix must be < num_FFT_primes
{
    debug_handler( "fft_data" , "evaluate( sdigit*, const sdigit*, lidia_size_t, lidia_size_t )" );
    //ix == num_FFT_primes is already one past the last valid prime
    if ((k > FT->max_degree) || (ix >= FT->num_FFT_primes))
	lidia_error_handler( "fft_data" , "evaluate( sdigit*, const sdigit*, lidia_size_t, lidia_size_t )::index out of range");

    FT->FFT(A, a, k, FT->FFT_primes[ix], FT->RootTable[ix]);
}


void fft_data::interpolate(sdigit* A, const sdigit* a, lidia_size_t k, lidia_size_t ix) const
	//A = 2^k-point FFT of a modulo FFT_prime[ix], using the inverse
	//roots RootInvTable[ix]; no scaling by 2^(-k) is done here
	//k must be <= max_degree, ix must be < num_FFT_primes
{
    debug_handler( "fft_data" , "interpolate( sdigit*, const sdigit*, lidia_size_t, lidia_size_t )" );
    //ix == num_FFT_primes is already one past the last valid prime
    if ((k > FT->max_degree) || (ix >= FT->num_FFT_primes))
	lidia_error_handler( "fft_data" , "interpolate( sdigit*, const sdigit*, lidia_size_t, lidia_size_t )::index out of range" );
	
    FT->FFT(A, a, k, FT->FFT_primes[ix], FT->RootInvTable[ix]);
}


void fft_data::pointwise_multiply(sdigit* x, const sdigit* a, const sdigit* b, lidia_size_t k, lidia_size_t ix) const
	//x[i] = a[i]*b[i] mod FFT_prime[ix]; i = 0 .. (2^k-1)
	//k and ix are not range-checked here
{
    debug_handler( "fft_data", "pointwise_multiply( sdigit*, const sdigit*, const sdigit*, lidia_size_t, lidia_size_t )" );

    const sdigit q = FT->FFT_primes[ix];
#ifdef LIDIA_MUL_MOD_SPECIAL
    const double qinv = (1.0 / (double) q);
#endif
    const lidia_size_t count = 1 << k;

    //a plain loop; manually unrolled variants were tried and found slower
    for (lidia_size_t i = 0; i != count; i++)
    {
#ifndef LIDIA_MUL_MOD_SPECIAL
	x[i] = udigit_multiply_mod(a[i], b[i], q);
#else
	x[i] = mul_mod_sp(a[i], b[i], q, qinv);
#endif
    }
}


void fft_data::pointwise_add(sdigit* x, const sdigit* a, const sdigit* b, lidia_size_t k, lidia_size_t ix) const
	//x[i] = a[i]+b[i] mod FFT_prime[ix]; i = 0 .. (2^k-1)
{
    debug_handler( "fft_data", "pointwise_add( sdigit*, const sdigit*, const sdigit*, lidia_size_t, lidia_size_t )" );

    const lidia_size_t count = 1 << k;
    const sdigit q = FT->FFT_primes[ix];

    for (lidia_size_t i = 0; i != count; i++)
	x[i] = udigit_add_mod(a[i], b[i], q);
}

void fft_data::pointwise_subtract(sdigit* x, const sdigit* a, const sdigit* b, lidia_size_t k, lidia_size_t ix) const
	//x[i] = a[i]-b[i] mod FFT_prime[ix]; i = 0 .. (2^k-1)
{
    debug_handler( "fft_data", "pointwise_subtract( sdigit*, const sdigit*, const sdigit*, lidia_size_t, lidia_size_t )" );

    const lidia_size_t count = 1 << k;
    const sdigit q = FT->FFT_primes[ix];

    for (lidia_size_t i = 0; i != count; i++)
	x[i] = udigit_subtract_mod(a[i], b[i], q);
}

void fft_data::pointwise_add_mul(sdigit* x, const sdigit* a, const sdigit* b, const sdigit* c, const sdigit* d, lidia_size_t k, lidia_size_t ix) const
	//x[i] = a[i]*b[i] + c[i]*d[i] mod FFT_prime[ix]; i = 0 .. (2^k-1)
{
    debug_handler( "fft_data", "pointwise_add_mul( sdigit*, const sdigit*, const sdigit*, const sdigit*, const sdigit*, lidia_size_t, lidia_size_t )" );

    const lidia_size_t count = 1 << k;
    const sdigit q = FT->FFT_primes[ix];
#ifdef LIDIA_MUL_MOD_SPECIAL
    const double qinv = (1.0 / (double) q);
#endif

    for (lidia_size_t i = 0; i != count; i++)
    {
#ifndef LIDIA_MUL_MOD_SPECIAL
	x[i] = udigit_add_mod(udigit_multiply_mod(a[i], b[i], q),
			      udigit_multiply_mod(c[i], d[i], q), q);
#else
	x[i] = udigit_add_mod(mul_mod_sp(a[i], b[i], q, qinv),
			      mul_mod_sp(c[i], d[i], q, qinv), q);
#endif
    }
}




void fft_table::info()
{
    cout<<"\nfft_table::info ================================\n";
    fft_table* p = fft_table::head;
    fft_crt_table* pp;
    
    while (p != 0)
    {
    	lidia_size_t i;
	cout<<"address "<<p<<endl;
	cout<<"  max_degree = "<<p->max_degree<<endl;
    	cout<<"  FFT_primes\n  [ ";
	for(i=0; i < p->num_FFT_primes; i++)
    	    cout<<p->FFT_primes[i]<<"  ";
	cout<<"]\n  reference_counter = "<<p->reference_counter<<endl;

	pp = p->CRT_table_list;
    	while (pp != 0)
	{
    	    pp->info();
	    pp = pp->next;
	}
	cout<<"\n  ----------------------------------"<<endl;

        p = p->next;
    }
    cout<<"end fft_table::info\n"<<endl;
}



void fft_table::fft_crt_table::info() const
// prints the modulus this CRT table was built for (one line)
{
    cout<<"    fft_crt_table: modulus = ";
    cout<<mod.mod()<<endl;
}

