/*
    -- MAGMA (version 2.6.2) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date March 2022

       @generated from magmablas/zlarfgx-v2.cu, normal z -> s, Mon Mar 21 16:53:16 2022

*/
#include "magma_internal.h"
#include "commonblas_s.h"

// 512 is maximum number of threads for CUDA capability 1.x
#define BLOCK_SIZE 512

#define REAL


/***************************************************************************//**
    Kernel behind magma_slarfgx_gpu: forms the Householder scalar tau, the
    value beta and the scaled vector v from a pre-computed norm.

    Launch layout: 1-D grid of 1-D blocks. The global index is computed as
    j = threadIdx.x + BLOCK_SIZE*blockIdx.x, so the block size must be
    BLOCK_SIZE. Uses two statically allocated __shared__ floats per block.

    Work done:
      - Thread 0 of every block reads dxnorm[0] and dx0[0].
      - If dxnorm[0] == 0 or n == 1, thread 0 of every block writes
        tau = 0 and dA[0] = dx0[0] (all blocks write the same values).
      - Otherwise beta = -copysign(dxnorm[0], dx0[0]); the thread with j == 0
        writes tau = (beta - alpha)/beta and dA[0] = beta, and every block
        keeps scale = 1/(alpha - beta) in shared memory.
      - Threads with j < n-1 overwrite dx[j] with dx[j]*scale when
        dxnorm[0] != 0.
      - Threads with j < it copy dx0[j-it] to dA[j-it] and then set
        dx0[j-it] to zero.

    dx0[0] itself is never written here: every block still has to read it as
    alpha, so overwriting it inside the kernel would be a race.

    @param[in]     n       Length of the vector [dx0; dx].
    @param[in,out] dx0     Pointer to alpha; the `it` entries in front of it
                           are read and then zeroed.
    @param[in,out] dx      The n-1 trailing entries; overwritten by v(2:n).
    @param[out]    dtau    Receives tau.
    @param[in]     dxnorm  dxnorm[0] is used as |beta|.
    @param[out]    dA      dA[0] receives beta (or alpha in the degenerate
                           case); dA[-it..-1] receive dx0[-it..-1].
    @param[in]     it      Number of entries in front of dx0/dA to move.
*******************************************************************************/
__global__
void magma_slarfgx_gpu_kernel( int n, float* dx0, float* dx,
                               float *dtau, float *dxnorm,
                               float *dA, int it)
{
    const int i = threadIdx.x;
    const int j = i + BLOCK_SIZE * blockIdx.x;
    __shared__ float scale;
    __shared__ float xnorm;
  
    // Loaded before the barrier; only consumed under the same j < n-1 guard.
    float dxi = 0.0f;

    if ( j < n-1 )
        dxi = dx[j];
  
    if ( i == 0 ) {
        xnorm = *dxnorm;
        #ifdef REAL
        float alpha = *dx0;
        float alphai = MAGMA_S_ZERO;
        if ( (xnorm == 0 && alphai == MAGMA_S_ZERO ) || n == 1 )
        #else
        float alpha = *dx0;
        float alphar =  MAGMA_S_REAL(alpha), alphai = MAGMA_S_IMAG(alpha);
        if ( (xnorm == 0 && alphai == MAGMA_S_ZERO ) || n == 0 )
        #endif
        {
            // Degenerate case: H = I. scale is left unset, but it is never
            // read because the scaling below requires xnorm != 0 and j < n-1.
            *dtau = MAGMA_S_ZERO;
            *dA   = *dx0;
        }
        else {
            #ifdef REAL
                // no need to compute the norm as it is passed as input
                float beta  = xnorm; // sqrt( alpha*alpha + xnorm*xnorm );
                beta  = -copysign( beta, alpha );
     
                // todo: deal with badly scaled vectors (see lapack's larfg)
                if (j == 0) {
                    *dtau = (beta - alpha) / beta;
                    //*dx0  = 1.; //cannot be done here because of a race condition: all thread blocks need to read it for alpha
                    *dA   = beta;
                }
    
                // Float literal keeps the reciprocal in single precision
                // (a bare `1.` would promote the division to double).
                scale = 1.0f / (alpha - beta);
            #else
                // no need to compute the norm as it is passed as input
                float beta  = xnorm; // sqrt( alphar*alphar + alphai*alphai + xnorm*xnorm );
                beta  = -copysign( beta, alphar );
    
                // todo: deal with badly scaled vectors (see lapack's larfg)
                if (j == 0) {
                    *dtau = MAGMA_S_MAKE((beta - alphar)/beta, -alphai/beta);
                    //*dx0  = MAGMA_S_MAKE(  1., 0.); //cannot be done here because of a race condition: all thread blocks need to read it for alpha
                    *dA   = MAGMA_S_MAKE(beta, 0.);
                }
    
                alpha = MAGMA_S_MAKE( MAGMA_S_REAL(alpha) - beta, MAGMA_S_IMAG(alpha));
                scale = MAGMA_S_DIV( MAGMA_S_ONE, alpha);
            #endif
        }
    }

    // scale x
    // The barrier publishes xnorm and scale, written by thread 0, to the block.
    __syncthreads();
    if ( xnorm != 0 && j < n-1)
        dx[j] = MAGMA_S_MUL(dxi, scale);

    // Move the `it` entries stored in front of dx0 into dA and zero them.
    if (j < it) {
        *( dA-it+j) = *(dx0-it+j);
        *(dx0-it+j) = MAGMA_S_MAKE(0., 0.);
    }
}


/***************************************************************************//**
    Generates Householder elementary reflector H = I - tau v v^T to reduce
        H [ dx0 ] = [ beta ]
          [ dx  ]   [ 0    ]
    with |beta| = norm( [dx0, dx] ) = dxnorm[0].
    Stores v over dx; first element of v is 1 and is not stored.
    Stores beta in dA[0]; dx0[0] itself is not modified by this routine.
    Stores tau.

    In addition, the iter entries stored immediately in front of dx0 are
    copied to the iter entries immediately in front of dA and then set to
    zero in place.
    
    The difference with LAPACK's slarfg is that the norm of dx, and hence beta,
    are computed outside the routine and passed to it in dxnorm (array on the GPU).

    The kernel is launched on the queue's CUDA stream and this routine does
    not synchronize afterwards. For n <= 0 nothing is launched.

    @param[in]
    n       INTEGER
            Length of the vector [dx0; dx].

    @param[in,out]
    dx0     REAL pointer on the GPU.
            dx0[0] is alpha (read only); dx0[-iter..-1] are read and zeroed.

    @param[in,out]
    dx      REAL array on the GPU, n-1 entries used.
            On exit, overwritten by v(2:n).

    @param[out]
    dtau    REAL pointer on the GPU; receives tau.

    @param[in]
    dxnorm  REAL pointer on the GPU; dxnorm[0] is used as |beta|.

    @param[out]
    dA      REAL pointer on the GPU.
            dA[0] receives beta; dA[-iter..-1] receive dx0[-iter..-1].

    @param[in]
    iter    INTEGER
            Number of entries in front of dx0 and dA to move.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.
*******************************************************************************/
extern "C" void
magma_slarfgx_gpu(
    magma_int_t n,
    magmaFloat_ptr dx0,
    magmaFloat_ptr dx,
    magmaFloat_ptr dtau,
    magmaFloat_ptr        dxnorm,
    magmaFloat_ptr dA, magma_int_t iter, 
    magma_queue_t queue )
{
    // Quick return: an empty vector has no reflector, and a grid with zero
    // blocks is an invalid launch configuration that would leave a CUDA error.
    if ( n <= 0 )
        return;

    // Thread j of the kernel serves element j of dx (j < n-1) and also entry j
    // of the iter values moved from in front of dx0 to dA (j < iter). Size the
    // grid for the larger of the two so the copy is never cut short when
    // iter exceeds n rounded up to BLOCK_SIZE.
    const magma_int_t nthreads = ( n > iter ? n : iter );

    dim3 blocks( magma_ceildiv( nthreads, BLOCK_SIZE ) );
    dim3 threads( BLOCK_SIZE );
 
    magma_slarfgx_gpu_kernel
        <<< blocks, threads, 0, queue->cuda_stream() >>>
        ( n, dx0, dx, dtau, dxnorm, dA, iter);
}


/***************************************************************************//**
    Generates the Householder elementary reflector H(iter) by calling
    magma_slarfgx_gpu (same n, dx0, dx, dtau, dxnorm, dA, iter arguments),
    and then fills in column iter of the matrix T.

    For iter == 0 the column consists of the single entry T(0,0), which is
    copied from dtau on the GPU; afterwards the value one is copied from the
    host into dx0[0].

    For iter > 0 two kernels are launched on the queue's CUDA stream:
    magma_sgemv_kernel3 with iter blocks of BLOCK_SIZE threads on
    (n, V, ldv, dx0, dwork, dtau), followed by magma_strmv_kernel2 with iter
    blocks of iter threads on (T, ldt, dwork, T + iter*ldt, dtau).
    NOTE(review): both kernels are defined outside this file; presumably the
    first forms a scaled product of V with the new reflector in dwork and the
    second multiplies it by T -- confirm against their definitions.

    As in magma_slarfgx_gpu, the norm used for beta is computed outside the
    routine and passed in dxnorm (array on the GPU).
*******************************************************************************/
extern "C" void
magma_slarfgtx_gpu(
    magma_int_t n,
    magmaFloat_ptr dx0,
    magmaFloat_ptr dx,
    magmaFloat_ptr dtau,
    magmaFloat_ptr        dxnorm,
    magmaFloat_ptr dA, magma_int_t iter,
    magmaFloat_ptr V,  magma_int_t ldv,
    magmaFloat_ptr T,  magma_int_t ldt,
    magmaFloat_ptr dwork,
    magma_queue_t queue )
{
    // Step 1: build the reflector H(iter) itself.
    magma_slarfgx_gpu(n, dx0, dx, dtau, dxnorm, dA, iter, queue);

    // Step 2 (general case): compute the iter-th column of T with two kernels.
    if (iter != 0) {
        magma_sgemv_kernel3
            <<< iter, BLOCK_SIZE, 0, queue->cuda_stream() >>>
            ( n, V, ldv, dx0, dwork, dtau );

        magma_strmv_kernel2
            <<< iter, iter,       0, queue->cuda_stream() >>>
            ( T, ldt, dwork, T+iter*ldt, dtau );
        return;
    }

    // Step 2 (first reflector): the column is just the diagonal entry, tau.
    magmaFloat_ptr T_diag = T + iter + iter*ldt;
    magmablas_slacpy( MagmaFull, 1, 1, dtau, 1, T_diag, 1, queue );

    // Overwrite dx0[0] with one, copied from a host-side scalar.
    float one_host = MAGMA_S_ONE;
    magma_ssetmatrix( 1, 1, &one_host, 1, dx0, 1, queue );
}
