/*
  File: getpic.cc

*/

#include <string.h>
#include "all.hh"

#ifdef HAVE_MMX
/* MMX helper routines with C linkage; their definitions are not part of
   the visible source.  Only IDCT_mmx is called from this file, the calls
   to add_block_mmx / set_block_mmx in addblock are commented out. */
extern "C" void IDCT_mmx(short *);
extern "C" void add_block_mmx(unsigned char *, short *, int);
extern "C" void set_block_mmx(unsigned char *, short *, int);
/* 64-bit constant made of four 16-bit words that each hold 0x0080 (128).
   It is referenced by symbol name from the inline assembly in
   LayerData::addblock (paddw / movq MMX_128). */
static unsigned long long MMX_128 = 0x80008000800080LL;

#endif

/* ---------------------------------------------------
*/
/*
 * Scale down the single coefficient *data in place: divide it by the
 * quantiser (8, or 8*16 when HAVE_MMX is defined) and round to the nearest
 * integer, with ties rounded away from zero.
 *
 * The arithmetic is done in int.  Doing it in a short overflows for inputs
 * close to -32768 (negating the value, or adding the rounding offset, no
 * longer fits into 16 bits) and produced a result with the wrong sign.
 */
static void j_rev_dct_sparse (short *data)
{
#ifdef HAVE_MMX
  const int quant = 8 * 16;
#else
  const int quant = 8;
#endif
  const int half = quant >> 1;      /* rounding offset */
  const int v = *data;

  /* round the magnitude, then restore the sign (symmetric rounding) */
  const int magnitude = ((v < 0 ? -v : v) + half) / quant;

  /* |result| <= (32768 + half) / quant, which always fits into a short */
  *data = static_cast<short>(v < 0 ? -magnitude : magnitude);
}


/*
 * Decode one frame or field picture.
 *
 * Picks the three destination planes in newframe[], decodes every
 * macroblock through getMBs() and then hands a picture to the display,
 * except when framenum is 0.  For field pictures the secondfield flag is
 * toggled on exit.
 */
void LayerData::getpicture(int framenum){
  if (pict_struct==FRAME_PICTURE && secondfield){
    /* a frame picture arrived while a field pair was still open */
    printf("odd number of field pictures\n");
    secondfield = 0;
  }

  /* choose the destination buffer of each of the three planes */
  for (int plane=0; plane<3; plane++){
    if (pict_type!=B_TYPE){
      if (!secondfield){
        /* first field or whole frame: exchange the two reference buffers */
        unsigned char* previous = refframe[plane];
        refframe[plane] = oldrefframe[plane];
        oldrefframe[plane] = previous;
      }

      newframe[plane] = refframe[plane];
    }
    else
      newframe[plane] = auxframe[plane];

    /* a bottom field starts one line width further into the buffer */
    if (pict_struct==BOTTOM_FIELD){
      if (plane==0) newframe[plane] += coded_picture_width;
      else          newframe[plane] += chrom_width;
    }
  }

//if (pict_scal && !secondfield) getspatref();

  getMBs(framenum);

  /* nothing is shown for picture number 0 */
  if (framenum!=0){
    if (pict_struct!=FRAME_PICTURE && !secondfield)
      display->display_second_field();
    else if (pict_type==B_TYPE)
      display->dither(auxframe);
    else
      display->dither(oldrefframe);
  }

  /* field pictures alternate between first and second field */
  if (pict_struct!=FRAME_PICTURE) secondfield = !secondfield;
}


/*
 * Output the last reference frame.
 *
 * If secondfield is still set the frame is not passed to the display and
 * a message is printed instead.
 */
void LayerData::putlast(){
  if (!secondfield){
    display->dither(refframe);
    return;
  }

  printf("last frame incomplete, not stored\n");
}


/* decode all macroblocks of the current picture */
/*
 * Loops over the macroblocks of the picture until MBA reaches MBAmax or
 * the bitstream ends the picture early.  Each iteration either decodes one
 * macroblock (MBAinc==1) or treats it as skipped (MBAinc!=1), runs motion
 * compensation for non-intra macroblocks and writes the coded blocks into
 * the picture through addblock().
 *
 * framenum is used only for the trace output and to suppress
 * display_second_field() for picture number 0.
 *
 * Error handling: the parsing helpers report problems through the member
 * `fault`; whenever it is set, control jumps to the `resync` label, which
 * searches for the next start code.
 */

void LayerData::getMBs(int framenum){
  int comp;
  int MBA, MBAmax, MBAinc, mb_type, cbp, motion_type(0), dct_type;
  int slice_vert_pos_ext;
  int bx, by;
  unsigned int code;
  int dc_dct_pred[3];
  int mv_count, mv_format, mvscale;
  int PMV[2][2][2], mv_field_sel[2][2];
  int dmv, dmvector[2];
  int qs;
  int stwtype, stwclass; 
  int SNRcbp;
//  int SNRMBA(0), SNRmb_type,, SNRMBAinc(0), SNRdct_type, dummy; // SNR scal.

  /* number of macroblocks per picture */
  MBAmax = mb_width*mb_height;

  if (pict_struct!=FRAME_PICTURE)
    MBAmax>>=1; /* field picture has half as many macroblocks as frame */

  MBA = 0; /* macroblock address */
  MBAinc = 0;

/*
  if (twostreams && enhan.scalable_mode==SC_SNR){
    SNRMBA=0;
    SNRMBAinc=0;
  }
*/

  fault=0;

  for (;;){
#ifdef TRACE
    if (trace) printf("frame %d, MB %d\n",framenum,MBA);
#endif

    /* interlaced frame picture: trigger the second-field display once
       half of the macroblocks have been decoded */
    if (!prog_seq && pict_struct==FRAME_PICTURE && MBA==(MBAmax>>1) &&
        framenum!=0)
// && display->getType()==T_X11)
      display->display_second_field();

//  ld = &base;
    /* MBAinc==0: the previous address increment is used up, read a new one */
    if (MBAinc==0){
//      if (scalable_mode==SC_DP && pri_brk==1) ld = &enhan;

      if (!input->showbits(23) || fault){ /* startcode or fault */
resync: /* if fault: resynchronize to next startcode */
        fault = 0;

        if (MBA>=MBAmax) return; /* all macroblocks decoded */

        code=input->startcode();
        if (code<Slice_min_start || code>Slice_max_start){
          /* only slice headers are allowed in picture_data */
          if (!quiet) printf("Premature end of picture\n");
          return;
        }
        input->flushbits(32);

        /* decode slice header (may change quant_scale) */
        slice_vert_pos_ext = getslicehdr();

/*
        if (scalable_mode==SC_DP){
          ld = &enhan;
          input->startcode();
          code = input->showbits(32);

          if (code<SLICE_MIN_START || code>SLICE_MAX_START){
            // only slice headers are allowed in picture_data
            if (!quiet) printf("Premature end of picture\n");
            return;
          }

          input->flushbits(32);

          // decode slice header (may change quant_scale)
          slice_vert_pos_ext = getslicehdr();

//          if (pri_brk!=1) ld = &base;
        }
*/

        /* decode macroblock address increment */
        MBAinc = getMBA();

        if (fault) goto resync;

        /* set current location */
        /* macroblock row = (slice_vert_pos_ext<<7) + low byte of the slice
           start code - 1; the address increment selects the column */
        MBA = ((slice_vert_pos_ext<<7) + (code&255) - 1)*mb_width + MBAinc - 1;
        MBAinc = 1; /* first macroblock in slice: not skipped */

        /* reset all DC coefficient and motion vector predictors */
        dc_dct_pred[0]=dc_dct_pred[1]=dc_dct_pred[2]=0;
        PMV[0][0][0]=PMV[0][0][1]=PMV[1][0][0]=PMV[1][0][1]=0;
        PMV[0][1][0]=PMV[0][1][1]=PMV[1][1][0]=PMV[1][1][1]=0;
      }
      else { /* neither startcode nor fault */
        if (MBA>=MBAmax){
          if (!quiet) printf("Too many macroblocks in picture\n");
          return;
        }

/*
        if (scalable_mode==SC_DP && pri_brk==1) ld = &enhan;
*/
        /* decode macroblock address increment */
        MBAinc = getMBA();

        if (fault) goto resync;
      }
    }

    if (MBA>=MBAmax){
      /* MBAinc points beyond picture dimensions */
      if (!quiet) printf("Too many macroblocks in picture\n");
      return;
    }

    if (MBAinc==1) /* not skipped */{
/*
      if (scalable_mode==SC_DP){
        if (pri_brk<=2) ld = &enhan;
        else ld = &base;
      }
*/

      macroblock_modes(&mb_type, &stwtype, &stwclass,
        &motion_type, &mv_count, &mv_format, &dmv, &mvscale, &dct_type);

      if (fault) goto resync;

      if (mb_type & MB_QUANT){
        qs = input->getbits(5);

#ifdef TRACE
        if (trace){
          printf("quantiser_scale_code (");
          printbits(qs,5,5);
          printf("): %d\n",qs);
        }
#endif

        if (mpeg2)
             quant_scale = qscale_type ? non_linear_mquant_table[qs] : (qs << 1);
        else quant_scale = qs;

        /* NOTE(review): the assignment below is a self-assignment and has
           no effect as written */
        if (scalable_mode==SC_DP)
          /* make sure quant_scale is valid */
          quant_scale = quant_scale;
      }

      /* motion vectors */

      /* decode forward motion vectors */
      if ((mb_type & MB_FORWARD) || ((mb_type & MB_INTRA) && conceal_mv)){
        if (mpeg2)
          motion_vectors(PMV,dmvector,mv_field_sel,
            0,mv_count,mv_format,h_forw_r_size,v_forw_r_size,dmv,mvscale);
        else
          motion_vector(PMV[0][0],dmvector,
            forw_r_size,forw_r_size,0,0,full_forw);
      }

      if (fault) goto resync;

      /* decode backward motion vectors */
      if (mb_type & MB_BACKWARD){
        if (mpeg2)
          motion_vectors(PMV,dmvector,mv_field_sel,
            1,mv_count,mv_format,h_back_r_size,v_back_r_size,0,mvscale);
        else
          motion_vector(PMV[0][1],dmvector,
            back_r_size,back_r_size,0,0,full_back);
      }

      if (fault) goto resync;

      if ((mb_type & MB_INTRA) && conceal_mv)
        input->flushbits(1); /* remove marker_bit */

/*
      if (scalable_mode==SC_DP && pri_brk==3) ld = &enhan;
*/

      /* macroblock_pattern */
      /* cbp holds one bit per block; block `comp` corresponds to bit
         (blk_cnt-1-comp), see the decode loop below */
      if (mb_type & MB_PATTERN){
        cbp = getCBP();
        if (chroma_format==CHROMA422){
          cbp = (cbp<<2) | input->getbits(2); /* coded_block_pattern_1 */

#ifdef TRACE
          if (trace){
            printf("coded_block_pattern_1: ");
            printbits(cbp,2,2);
            printf(" (%d)\n",cbp&3);
          }
#endif
        }
        else if (chroma_format==CHROMA444){
          cbp = (cbp<<6) | input->getbits(6); /* coded_block_pattern_2 */

#ifdef TRACE
          if (trace){
            printf("coded_block_pattern_2: ");
            printbits(cbp,6,6);
            printf(" (%d)\n",cbp&63);
          }
#endif
        }
      }
      else
        /* no pattern transmitted: intra macroblocks code every block,
           all others code none */
        cbp = (mb_type & MB_INTRA) ? (1<<blk_cnt)-1 : 0;

      if (fault) goto resync;

      /* decode blocks */
      clearblock(0,blk_cnt);
      for (comp=0; comp<blk_cnt; comp++){
//        if (scalable_mode==SC_DP) ld = &base;


        if (cbp & (1<<(blk_cnt-1-comp))){
          if (mb_type & MB_INTRA){
            if (mpeg2) getmpg2intrablock(comp,dc_dct_pred);
            else           getintrablock(comp,dc_dct_pred);
          }
          else {
            if (mpeg2) getmpg2interblock(comp);
            else           getinterblock(comp);
          }

          if (fault) goto resync;
        }
      }

      /* reset intra_dc predictors */
      if (!(mb_type & MB_INTRA))
        dc_dct_pred[0]=dc_dct_pred[1]=dc_dct_pred[2]=0;

      /* reset motion vector predictors */
      if ((mb_type & MB_INTRA) && !conceal_mv){
        /* intra mb without concealment motion vectors */
        PMV[0][0][0]=PMV[0][0][1]=PMV[1][0][0]=PMV[1][0][1]=0;
        PMV[0][1][0]=PMV[0][1][1]=PMV[1][1][0]=PMV[1][1][1]=0;
      }

      if ((pict_type==P_TYPE) && !(mb_type & (MB_FORWARD|MB_INTRA))){
        /* non-intra mb without forward mv in a P picture */
        PMV[0][0][0]=PMV[0][0][1]=PMV[1][0][0]=PMV[1][0][1]=0;

        /* derive motion_type */
        if (pict_struct==FRAME_PICTURE) motion_type = MC_FRAME;
        else
        {
          motion_type = MC_FIELD;
          /* predict from field of same parity */
          mv_field_sel[0][0] = (pict_struct==BOTTOM_FIELD);
        }
      }

      if (stwclass==4)
      {
        /* purely spatially predicted macroblock */
        PMV[0][0][0]=PMV[0][0][1]=PMV[1][0][0]=PMV[1][0][1]=0;
        PMV[0][1][0]=PMV[0][1][1]=PMV[1][1][0]=PMV[1][1][1]=0;
      }
    }
    else { /* MBAinc!=1: skipped macroblock */
//      if (scalable_mode==SC_DP) ld = &base;

      clearblock(0,blk_cnt);

      /* reset intra_dc predictors */
      dc_dct_pred[0]=dc_dct_pred[1]=dc_dct_pred[2]=0;

      /* reset motion vector predictors */
      if (pict_type==P_TYPE)
        PMV[0][0][0]=PMV[0][0][1]=PMV[1][0][0]=PMV[1][0][1]=0;

      /* derive motion_type */
      if (pict_struct==FRAME_PICTURE)
        motion_type = MC_FRAME;
      else
      {
        motion_type = MC_FIELD;
        /* predict from field of same parity */
        mv_field_sel[0][0]=mv_field_sel[0][1] = (pict_struct==BOTTOM_FIELD);
      }

      /* skipped I are spatial-only predicted, */
      /* skipped P and B are temporal-only predicted */
      stwtype = (pict_type==I_TYPE) ? 8 : 0;

      /* clear MB_INTRA */
      /* NOTE(review): mb_type keeps the value of the previously decoded
         macroblock here; if the very first macroblock of the picture is
         reached through this branch it has never been assigned - confirm
         that the bitstream syntax rules this out */
      mb_type&= ~MB_INTRA;

      cbp = 0; /* no block data */
    }

    SNRcbp = 0;

#ifdef ENHANCEMENT

    /* NOTE(review): this section uses SNRMBA, SNRMBAinc, SNRmb_type,
       SNRdct_type and dummy, whose declaration is commented out at the top
       of this function - it presumably does not compile as shown when
       ENHANCEMENT is defined */
    if (twostreams && enhan.scalable_mode==SC_SNR){
      ld = &enhan;
      if (SNRMBAinc==0){
        if (!input->showbits(23)){ /* startcode */
          code=input->startcode();
//          code = input->showbits(32);
          if (code<SLICE_MIN_START || code>SLICE_MAX_START){
            /* only slice headers are allowed in picture_data */
            if (!quiet)
              printf("Premature end of picture\n");
            return;
          }
          input->flushbits(32);

          /* decode slice header (may change quant_scale) */
          slice_vert_pos_ext = getslicehdr();

          /* decode macroblock address increment */
          SNRMBAinc = getMBA();

          /* set current location */
          SNRMBA =
            ((slice_vert_pos_ext<<7) + (code&255) - 1)*mb_width + SNRMBAinc - 1;

          SNRMBAinc = 1; /* first macroblock in slice: not skipped */
        }
        else { /* not startcode */
          if (SNRMBA>=MBAmax){
            if (!quiet) printf("Too many macroblocks in picture\n");
            return;
          }

          /* decode macroblock address increment */
          SNRMBAinc = getMBA();
        }
      }

      if (SNRMBA!=MBA){
        /* streams out of sync */
        if (!quiet) printf("Cant't synchronize streams\n");
        return;
      }

      if (SNRMBAinc==1){     /* not skipped */
        macroblock_modes(&SNRmb_type, &dummy, &dummy,
          &dummy, &dummy, &dummy, &dummy, &dummy,
          &SNRdct_type);

        if (SNRmb_type & MB_PATTERN) dct_type = SNRdct_type;

        if (SNRmb_type & MB_QUANT){
          qs = input->getbits(5);
          quant_scale = qscale_type ? non_linear_mquant_table[qs] : qs<<1;
        }

        /* macroblock_pattern */
        if (SNRmb_type & MB_PATTERN)
        {
          SNRcbp = getCBP();

          if (chroma_format==CHROMA422)
            SNRcbp = (SNRcbp<<2) | input->getbits(2); /* coded_block_pattern_1 */
          else if (chroma_format==CHROMA444)
            SNRcbp = (SNRcbp<<6) | input->getbits(6); /* coded_block_pattern_2 */
        }
        else
          SNRcbp = 0;

        /* decode blocks */
        clearblock(0,blk_cnt);
        for (comp=0; comp<blk_cnt; comp++)
        {

          if (SNRcbp & (1<<(blk_cnt-1-comp)))
            getmpg2interblock(comp);
        }
      }
      else /* SNRMBAinc!=1: skipped macroblock */
      {
        clearblock(0,blk_cnt);
      }

//      ld = &base;
    }
#endif

    /* pixel coordinates of top left corner of current macroblock */
    bx = 16*(MBA%mb_width);
    by = 16*(MBA/mb_width);

    /* motion compensation */
    if (!(mb_type & MB_INTRA))
      reconstruct(bx,by,mb_type,motion_type,PMV,mv_field_sel,dmvector,
                  stwtype);

#ifdef ENHANCEMENT
    if (scalable_mode==SC_DP) ld = &base;
#endif

    /* copy or add block data into picture */
    for (comp=0; comp<blk_cnt; comp++){
      if ((cbp|SNRcbp) & (1<<(blk_cnt-1-comp))){
#ifdef ENHANCEMENT
        if (twostreams && enhan.scalable_mode==SC_SNR &&
            SNRcbp & (1<<(blk_cnt-1-comp)))
          sumblock(comp); /* add SNR enhancement layer data to base layer */
#endif
        /* inverse DCT */
//        if (sparse[comp])
//           j_rev_dct_sparse(block[comp]);
//        else {
#ifdef HAVE_MMX
  	  IDCT_mmx(block[comp]);
#else
          idct->conversion(block[comp]);
#endif
//        }
        /* last argument is the add flag: non-intra blocks are added to the
           motion compensated prediction, intra blocks are stored */
        addblock(comp,bx,by,dct_type,(mb_type & MB_INTRA)==0);
      }
    }

    /* advance to next macroblock */
    MBA++;
    MBAinc--;

#ifdef ENHANCEMENT
    if (twostreams && enhan.scalable_mode==SC_SNR){
      SNRMBA++;
      SNRMBAinc--;
    }
#endif
  }
}


/*
 * Read the mode information of one macroblock from the bitstream and
 * derive the values that depend on it.
 *
 * Bitstream reads happen in this order: macroblock type (getMBtype),
 * optional 2-bit spatial_temporal_weight_code, optional 2-bit motion type,
 * optional 1-bit dct_type.
 *
 * All results are returned through the pointer arguments.  They are
 * written only at the very end; if getMBtype() raises `fault` the function
 * returns early and leaves every output untouched.
 */
void LayerData::macroblock_modes(int *pmb_type, int *pstwtype, int *pstwclass,
  int *pmotion_type, int *pmv_count, int *pmv_format, int *pdmv, int *pmvscale,
  int *pdct_type)
{
  /* weight code -> weight type, one row per stwc_table_index 1..3 */
  static unsigned char stwc_table[3][4]
    = { {6,3,7,4}, {2,1,5,4}, {2,5,7,4} };
  /* weight type -> weight class */
  static unsigned char stwclass_table[9]
    = {0, 1, 2, 1, 1, 2, 3, 3, 4};

  /* get macroblock_type */
  const int type = getMBtype();

  if (fault) return;

  /* get spatial_temporal_weight_code */
  int weightType;
  if (!(type & MB_WEIGHT))
    weightType = (type & MB_CLASS4) ? 8 : 0;
  else if (stwc_table_index==0)
    weightType = 4;
  else
  {
    const int weightCode = input->getbits(2);
    weightType = stwc_table[stwc_table_index-1][weightCode];
  }

  /* derive spatial_temporal_weight_class (Table 7-18) */
  const int weightClass = stwclass_table[weightType];

  /* get frame/field motion type */
  int motion = 0;
  if (type & (MB_FORWARD|MB_BACKWARD)){
    if (pict_struct==FRAME_PICTURE){ /* frame_motion_type */
      if (frame_pred_dct) motion = MC_FRAME;
      else                motion = input->getbits(2);
#ifdef TRACE
      if (!frame_pred_dct && trace){
        const char *label = "Invalid";
        if      (motion==MC_FIELD) label = "Field";
        else if (motion==MC_FRAME) label = "Frame";
        else if (motion==MC_DMV)   label = "Dual_Prime";

        printf("frame_motion_type (");
        printbits(motion,2,2);
        printf("): %s\n",label);
      }
#endif
    }
    else { /* field_motion_type */
      motion = input->getbits(2);
#ifdef TRACE
      if (trace){
        const char *label = "Invalid";
        if      (motion==MC_FIELD) label = "Field";
        else if (motion==MC_16X8)  label = "16x8 MC";
        else if (motion==MC_DMV)   label = "Dual_Prime";

        printf("field_motion_type (");
        printbits(motion,2,2);
        printf("): %s\n",label);
      }
#endif
    }
  }
  else if ((type & MB_INTRA) && conceal_mv)
  {
    /* concealment motion vectors */
    if (pict_struct==FRAME_PICTURE) motion = MC_FRAME;
    else                            motion = MC_FIELD;
  }

  /* derive mv_count, mv_format and dmv, (table 6-17, 6-18) */
  int vectorCount = 1;
  int vectorFormat = MV_FIELD;
  if (pict_struct==FRAME_PICTURE)
  {
    if (motion==MC_FIELD && weightClass<2) vectorCount = 2;
    if (motion==MC_FRAME) vectorFormat = MV_FRAME;
  }
  else if (motion==MC_16X8)
    vectorCount = 2;

  const int dualPrime = (motion==MC_DMV); /* dual prime */

  /* field mv predictions in frame pictures have to be scaled */
  const int scaleFlag = ((vectorFormat==MV_FIELD) && (pict_struct==FRAME_PICTURE));

  /* get dct_type (frame DCT / field DCT) */
  const bool dctTypeCoded = (pict_struct==FRAME_PICTURE)
                            && (!frame_pred_dct)
                            && (type & (MB_PATTERN|MB_INTRA));
  int dctType = 0;
  if (dctTypeCoded) dctType = input->getbits(1);

#ifdef TRACE
  if (trace && dctTypeCoded)
    printf("dct_type (%d): %s\n",dctType,dctType?"Field":"Frame");
#endif

  /* return values */
  *pmb_type = type;
  *pstwtype = weightType;
  *pstwclass = weightClass;
  *pmotion_type = motion;
  *pmv_count = vectorCount;
  *pmv_format = vectorFormat;
  *pdmv = dualPrime;
  *pmvscale = scaleFlag;
  *pdct_type = dctType;
}


/*
 * Zero `size` consecutive 64-coefficient blocks starting at block[comp]
 * and set the sparse flag of block `comp` (only that one) to 1.
 */
void LayerData::clearblock(int comp,int size){
  const size_t bytesToClear = sizeof(short)*64*size;

  memset(block[comp],0,bytesToClear);
  sparse[comp] = 1;
}


#ifdef ENHANCEMENT
/* add SNR enhancement layer block data to base layer */

void LayerData::sumblock(int comp){
  short *bp1, *bp2;

  bp1 = block[comp];
#ifdef ENHANCEMANT
  bp2 = enhan.block[comp];
#endif
  for (int i=0; i<64; i++) *bp1++ += *bp2++;
}
#endif

/* limit coefficients to -2048..2047 */

/*
 * Move or add the 8x8 block held in block[comp] into the picture planes
 * that newframe[] points to.
 *
 *   comp      block index inside the macroblock; indices below 4 go to
 *             plane 0, higher indices to plane (comp&1)+1
 *   bx, by    pixel coordinates of the macroblock's top left corner; they
 *             are scaled down here for planes 1/2 depending on
 *             chroma_format
 *   dct_type  non-zero selects the field DCT row layout (frame pictures
 *             only; ignored for planes 1/2 when chroma_format is
 *             CHROMA420)
 *   addflag   non-zero: add the block to the pixels already in the picture
 *             zero:     store the block itself, offset by 128
 *
 * Without HAVE_MMX the result goes through the table returned by
 * display->getClpTable(); with HAVE_MMX the packuswb instruction saturates
 * it to an unsigned byte.  With HAVE_MMX, sparse[comp] selects a shortcut
 * that only reads the first coefficient and uses it for all 64 pixels.
 *
 * NOTE(review): the inline assembly advances registers that are declared
 * as input operands (and ecx through `loop`) and declares neither outputs
 * nor a "memory" clobber - verify against the GCC extended asm rules.
 */

void LayerData::addblock(int comp, int bx, int by, int dct_type, int addflag){
  int cc,i, iincr;
  unsigned char *rfp;
  short *bp;
  int spar = sparse[comp];
#ifndef HAVE_MMX
  unsigned char *clp2=display->getClpTable();

  /* store mode: shift the table so that index v looks up entry v+128 */
  if (!addflag) clp2 += 128;
#endif

  cc = (comp<4) ? 0 : (comp&1)+1; /* color component index */

  /* rfp = address of the block's first pixel, iincr = distance between
     two consecutive block rows in the plane */
  if (cc==0){   /* luminance */

    if (pict_struct==FRAME_PICTURE)
      if (dct_type){
        /* field DCT coding */
        rfp = newframe[0]
              + coded_picture_width*(by+((comp&2)>>1)) + bx + ((comp&1)<<3);
        iincr = (coded_picture_width<<1);
      }
      else{
        /* frame DCT coding */
        rfp = newframe[0]
              + coded_picture_width*(by+((comp&2)<<2)) + bx + ((comp&1)<<3);
        iincr = coded_picture_width;
      }
    else {
      /* field picture */
      rfp = newframe[0]
            + (coded_picture_width<<1)*(by+((comp&2)<<2)) + bx + ((comp&1)<<3);
      iincr = (coded_picture_width<<1);
    }
  }
  else {
    /* chrominance */

    /* scale coordinates */
    if (chroma_format!=CHROMA444)  bx >>= 1;
    if (chroma_format==CHROMA420)  by >>= 1;
    if (pict_struct==FRAME_PICTURE){
      if (dct_type && (chroma_format!=CHROMA420)){
        /* field DCT coding */
        rfp = newframe[cc]
              + chrom_width*(by+((comp&2)>>1)) + bx + (comp&8);
        iincr = (chrom_width<<1);
      }
      else {
        /* frame DCT coding */
        rfp = newframe[cc]
              + chrom_width*(by+((comp&2)<<2)) + bx + (comp&8);
        iincr = chrom_width;
      }
    }
    else {
      /* field picture */
      rfp = newframe[cc]
            + (chrom_width<<1)*(by+((comp&2)<<2)) + bx + (comp&8);
      iincr = (chrom_width<<1);
    }
  }

  bp = block[comp];

  if (addflag) {
#ifdef HAVE_MMX
    if (spar) {
       /* add the first coefficient, replicated into four 16-bit lanes,
          to each of the 8 rows */
       __asm__ __volatile__(
            "movq       (%2),%%mm6\n"  /* 4 blockvals */
            "pxor       %%mm4,%%mm4\n"
            "punpcklwd  %%mm6,%%mm6\n"
            "punpcklwd  %%mm6,%%mm6\n"
            ".align 8\n"
            "1:"
                "movq       (%1),  %%mm0\n"     /* 8 rindex1 */
                "movq       %%mm0, %%mm2\n"
                "punpcklbw  %%mm4, %%mm0\n"
                "punpckhbw  %%mm4, %%mm2\n"
                "paddw      %%mm6, %%mm0\n"
                "paddw      %%mm6, %%mm2\n"

                "packuswb   %%mm2, %%mm0\n"
                "movq       %%mm0, (%1)\n"

                "leal       (%1,%3), %1\n"
              "loop       1b\n"
              :              /* scr   dest */
              : "c" (8),"r" (rfp), "r" (bp), "r" (iincr)
                );
    }
    else {
       /* add 8 coefficients to 8 pixels per row, 8 rows */
       __asm__ __volatile__(
         "pxor    %%mm4,%%mm4\n"

         ".align 8\n"
         "1:"
           "movq       (%2), %%mm0\n"   /* 8 rfp 0 1 2 3 4 5 6 7*/
           "movq       (%1), %%mm6\n"   /* 4 blockvals 0 1 2 3 */

           "movq       %%mm0, %%mm2\n"
           "movq       8(%1), %%mm5\n"  /* 4 blockvals 0 1 2 3 */
           "punpcklbw  %%mm4, %%mm0\n"  /* 0 2 4 6 */
           "punpckhbw  %%mm4, %%mm2\n"  /* 1 3 5 7 */

           "paddw      %%mm6, %%mm0\n"
           "paddw      %%mm5, %%mm2\n"
           "packuswb   %%mm2, %%mm0\n"

           "addl       $16, %1\n"
           "movq       %%mm0, (%2)\n"

           "leal       (%2,%3), %2\n"
         "loop       1b\n"
         :              /* scr   dest */
         : "c" (8),"r" (bp), "r" (rfp), "r" (iincr)
       );

//      add_block_mmx(rfp,bp,iincr);
    }
#else
    for (i=0; i<8; i++){
      rfp[0] = clp2[bp[0] + rfp[0]];
      rfp[1] = clp2[bp[1] + rfp[1]];
      rfp[2] = clp2[bp[2] + rfp[2]];
      rfp[3] = clp2[bp[3] + rfp[3]];
      rfp[4] = clp2[bp[4] + rfp[4]];
      rfp[5] = clp2[bp[5] + rfp[5]];
      rfp[6] = clp2[bp[6] + rfp[6]];
      rfp[7] = clp2[bp[7] + rfp[7]];
      rfp+= iincr;
      bp += 8;
    }
#endif
  }
  else {
#ifdef HAVE_MMX
     if (spar) {
            /* fill all 8 rows with (first coefficient + 128), saturated.
               Row 0 is stored explicitly first: the unrolled sequence
               that follows only writes rows 1..7. */
            __asm__ __volatile__(
            "movd       (%2),           %%mm0\n"                // " 0 0 0  v1"
            "punpcklwd  %%mm0,          %%mm0\n"    // " 0 0 v1 v1"
            "punpcklwd  %%mm0,          %%mm0\n"
            "paddw      MMX_128,        %%mm0\n"
            "packuswb   %%mm0,          %%mm0\n"
            "movq        %%mm0,         (%0)\n"                 /* row 0 */
            "leal       (%0,%1,2),      %%eax\n"

            "movq        %%mm0,         (%0, %1)\n"
            "movq        %%mm0,         (%%eax)\n"
            "leal        (%%eax,%1,2),  %0\n"
            "movq        %%mm0,         (%%eax, %1)\n"

            "movq        %%mm0,         (%0)\n"
            "leal        (%0,%1,2),     %%eax\n"
            "movq        %%mm0,         (%0, %1)\n"

            "movq        %%mm0,         (%%eax)\n"
            "movq        %%mm0,         (%%eax, %1)\n"
            :
            : "D" (rfp), "c" (iincr), "b" (bp)
            : "eax");
    }
    else {

       /* store (coefficient + 128), saturated; two rows per iteration */
       __asm__ __volatile__(
            "movq        MMX_128,%%mm4\n"
            ".align 8\n"
            "1:"
              "movq      (%1),   %%mm0\n"
              "movq      8(%1),  %%mm1\n"
              "paddw     %%mm4,  %%mm0\n"

              "movq      16(%1), %%mm2\n"
              "paddw     %%mm4,  %%mm1\n"

              "movq      24(%1), %%mm3\n"
              "paddw     %%mm4,  %%mm2\n"

              "packuswb  %%mm1,  %%mm0\n"
              "paddw     %%mm4,  %%mm3\n"

              "addl $32, %1\n"
              "packuswb  %%mm3,  %%mm2\n"

              "movq   %%mm0, (%2)\n"

              "movq   %%mm2, (%2,%3)\n"

              "leal       (%2,%3,2), %2\n"
            "loop       1b\n"
            :
            : "c" (4), "r" (bp), "r" (rfp), "r" (iincr)
        );

//      set_block_mmx(rfp,bp,iincr);
    }
#else
    for (i=0; i<8; i++){
      rfp[0] = clp2[bp[0]];
      rfp[1] = clp2[bp[1]];
      rfp[2] = clp2[bp[2]];
      rfp[3] = clp2[bp[3]];
      rfp[4] = clp2[bp[4]];
      rfp[5] = clp2[bp[5]];
      rfp[6] = clp2[bp[6]];
      rfp[7] = clp2[bp[7]];
      rfp+= iincr;
      bp += 8;
    }
#endif
  }
}
