/*
 * server/graph/packed/bitblit.c, part of W
 * (C) 94-02/96 by Torsten Scherer (TeSche)
 * itschere@techfak.uni-bielefeld.de
 *
 * bit blitting routines for the Atari graphics driver
 *
 * CHANGES:
 *
 * - major speedups in W0R8. ++kay, 10/94
 * - added clipping. ++kay, 1/96
 * - again a little speedup, TeSche 02/96
 */

#include <stdio.h>
#include "../../../lib/Wlib.h"
#include "../../config.h"
#include "../../types.h"
#include "../../pakets.h"
#include "../../proto.h"
#include "../clip.h"
#include "packed.h"


/*
 * note: if you're compiling this on any processor other than a Motorola 680x0,
 * be aware that parts of this code make long accesses on short boundaries and
 * may thus cause bus errors if the processor can't do that. so far this hasn't
 * been a problem at all; on the contrary, the fast bitline2* functions are 4
 * times faster than the slow ones working with the bfmasks.
 */


/*
 * this now really seems to be very close to the maximum speed achievable with
 * plain c-programming having in mind a 68000 cpu model, i.e. some idea how
 * many registers there are and what variables to put into them. further
 * optimization is *very* tricky, as already slight changes can lead gcc to
 * put other variables into registers than optimal and thus lose 20% of speed
 * without giving you very much of a chance to understand what went wrong.
 *
 * some things I've checked in particular and which are NOT faster than this:
 *
 * - make the monochrome version operate on long-boundaries and with bfmask32
 *   (the fast bitline versions operate on short-boundaries anyway and for the
 *   single slow bitline that's done per call it's not worth the trouble)
 *
 * some things I have checked to be faster than the previous versions:
 *
 * - lots of range checks reduce to bit tests rather than compares
 * - get rid of some code because of implicit information about the state of
 *   some variables (all fast bitline functions try to put dptr on a short-
 *   boundary first)
 * - get rid of a redundant variable (use variable sbit for two purposes)
 */


#if 0

/*
 * I'm still wondering if some proper 68020 code mightn't speed this up a final
 * time, but I still didn't check that.
 */

inline void bfchgd(register ulong d, register long offset, register long width)
{
  d ^= bfmask[offset][width-1];
}

inline void bfclrd(register ulong d, register long offset, register long width)
{
  d &= ~bfmask[offset][width-1];
}

inline void bfsetd(register ulong d, register long offset, register long width)
{
  d |= bfmask[offset][width-1];
}

#endif


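/*
 * the bitline copy routines: fast versions (bitline2right, bitline2left)
 * which produce a whole destination word per step out of two neighbouring
 * source words, and slow versions (slow_bitline2right, slow_bitline2left)
 * which never look at more than one source word at a time.
 */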

#ifdef MONOCHROME
inline void FUNCTION(bitline2right)(register ushort *sptr,
				    register long sbit,
				    register ushort *dptr,
				    register long dbit,
				    register long width)
{
  register ushort mask;
  register long todo;

  if (dbit) {
    if ((todo = 16 - dbit) > width) {
      todo = width;
    }

    mask = bfmask16[dbit][todo-1];
    *dptr++ = (*dptr & ~mask) | ((*(ulong *)sptr >> (16 + dbit - sbit)) & mask);

    if ((sbit += todo) & 16) {
      sbit &= 15;
      sptr++;
    }
    /* no need to check for dbit overrun here, as that was precicely our goal
     * and so we can assume that it happened without checking first. in fact
     * we can even more the incrementation of dptr into the upper calculation.
     */
    width -= todo;
  }

  sbit = 16 - sbit;   /* `steal' ;) this variable for other purposes */
  mask = width >> 4;
  while ((short)--mask >= 0) {
    *dptr++ = *(ulong *)sptr++ >> sbit;
  }

  if ((todo = width & 15)) {
    mask = bfmask16[0][todo-1];
    *dptr = (*dptr & ~mask) | ((*(ulong *)sptr >> sbit) & mask);
  }
}
#else
inline void FUNCTION(bitline2right)(sptr, sbit, splanes, dptr, dbit, dplanes, width)
     register ushort *sptr;
     register long sbit;
     register long splanes;
     register ushort *dptr;
     register long dbit;
     register long dplanes;
     register long width;
{
  register ushort mask;
  register long todo;

  if (dbit) {
    if ((todo = (16 - dbit)) > width) {
      todo = width;
    }
    mask = bfmask16[dbit][todo-1];
    *dptr = (*dptr & ~mask) | (((((ulong)sptr[0] << 16) | sptr[splanes]) >> (16 + dbit - sbit)) & mask);
    if ((sbit += todo) & 16) {
      sbit &= 15;
      sptr += splanes;
    }
    dptr += dplanes;
    width -= todo;
  }

  sbit = 16 - sbit;
  mask = width >> 4;
  while ((short)--mask >= 0) {
    *dptr = (((ulong)sptr[0] << 16) | sptr[splanes]) >> sbit;
    sptr += splanes;
    dptr += dplanes;
  }

  if ((todo = width & 15)) {
    mask = bfmask16[0][todo-1];
    *dptr = (*dptr & ~mask) | (((((ulong)sptr[0] << 16) | sptr[splanes]) >> sbit) & mask);
  }
}
#endif


#ifdef MONOCHROME
inline void FUNCTION(bitline2left)(sptr, sbit, dptr, dbit, width)
     register ushort *sptr;
     register long sbit;
     register ushort *dptr;
     register long dbit;
     register long width;
{
  register ushort mask;
  register long todo;

  ++sbit;

  if ((todo = ++dbit & 15)) {
    if (todo > width) {
      todo = width;
    }
    mask = bfmask16[dbit-todo][todo-1];
    *dptr-- = (*dptr & ~mask) | (((*(ulong *)(sptr - 1) << sbit) >> dbit) & mask);
    if ((sbit -= todo) <= 0) {
      sbit += 16;
      sptr--;
    }
    width -= todo;
  }
  sbit = 16 - sbit;

  mask = width >> 4;
  while ((short)--mask >= 0) {
    *dptr-- = *(ulong *)(--sptr) >> sbit;
  }

  if ((todo = width & 15)) {
    mask = bfmask16[16-todo][todo-1];
    *dptr = (*dptr & ~mask) | ((*(ulong *)(sptr - 1) >> sbit) & mask);
  }
}
#else
inline void FUNCTION(bitline2left)(sptr, sbit, splanes, dptr, dbit, dplanes, width)
     register ushort *sptr;
     register long sbit;
     register long splanes;
     register ushort *dptr;
     register long dbit;
     register long dplanes;
     register long width;
{
  register ushort mask;
  register long todo;

  ++sbit; ++dbit;

  if (dbit) {
    if ((todo = dbit & 15) > width) {
      todo = width;
    }
    mask = bfmask16[dbit-todo][todo-1];
    *dptr = (*dptr & ~mask) | ((((((ulong)sptr[-splanes] << 16) | sptr[0]) << sbit) >> dbit) & mask);
    if ((sbit -= todo) <= 0) {
      sbit += 16;
      sptr -= splanes;
    }
    dptr -= dplanes;
    width -= todo;
  }

  sbit = 16 - sbit;
  mask = width >> 4;
  while ((short)--mask >= 0) {
    *dptr = (((ulong)sptr[-splanes] << 16) | sptr[0]) >> sbit;
    dptr -= dplanes;
    sptr -= splanes;
  }

  if ((todo = width & 15)) {
    mask = bfmask16[16-todo][todo-1];
    *dptr = (*dptr & ~mask) | (((((ulong)sptr[-splanes] << 16) | sptr[0]) >> sbit) & mask);
  }
}
#endif


#ifdef MONOCHROME
inline void FUNCTION(slow_bitline2right)(sptr, sbit, dptr, dbit, width)
     register ushort *sptr;
     register long sbit;
     register ushort *dptr;
     register long dbit;
     register long width;
{
  register long	todo;

  while (width > 0) {

    if ((todo = 16 - MAX(sbit, dbit)) > width) {
      todo = width;
    }

    if (sbit > dbit) {
      /* must shift left */
      *dptr = (*dptr & ~bfmask16[dbit][todo-1]) | (*sptr & bfmask16[sbit][todo-1]) << (sbit - dbit);
    } else {
      /* must shift right */
      *dptr = (*dptr & ~bfmask16[dbit][todo-1]) | ((*sptr & bfmask16[sbit][todo-1]) >> (dbit - sbit));
    }

    if ((sbit += todo) & 16) {
      sbit = 0;
      sptr++;
    }

    if ((dbit += todo) & 16) {
      dbit = 0;
      dptr++;
    }

    width -= todo;
  }
}
#else
inline void FUNCTION(slow_bitline2right)(sptr, sbit, splanes, dptr, dbit, dplanes, width)
     register ushort *sptr;
     register long sbit;
     register long splanes;
     register ushort *dptr;
     register long dbit;
     register long dplanes;
     register long width;
{
  register long	todo;

  while (width > 0) {

    if ((todo = 16 - MAX(sbit, dbit)) > width) {
      todo = width;
    }

    if (sbit > dbit) {
      /* must shift left */
      *dptr = (*dptr & ~bfmask16[dbit][todo-1]) | (*sptr & bfmask16[sbit][todo-1]) << (sbit - dbit);
    } else {
      /* must shift right */
      *dptr = (*dptr & ~bfmask16[dbit][todo-1]) | ((*sptr & bfmask16[sbit][todo-1]) >> (dbit - sbit));
    }

    if ((sbit += todo) & 16) {
      sbit = 0;
      sptr += splanes;
    }

    if ((dbit += todo) & 16) {
      dbit = 0;
      dptr += dplanes;
    }

    width -= todo;
  }
}
#endif


#ifdef MONOCHROME
inline void FUNCTION(slow_bitline2left)(sptr, sbit, dptr, dbit, width)
     register ushort *sptr;
     register long sbit;
     register ushort *dptr;
     register long dbit;
     register long width;
{
  register long	todo;

  while (width) {

    if ((todo = MIN(sbit, dbit)) > (width - 1)) {
      todo = width - 1;
    }
    sbit -= todo;
    dbit -= todo;

    if (sbit > dbit) {
      /* must shift left */
      *dptr = (*dptr & ~bfmask16[dbit][todo]) | (*sptr & bfmask16[sbit][todo]) << (sbit - dbit);
    } else {
      /* must shift right */
      *dptr = (*dptr & ~bfmask16[dbit][todo]) | ((*sptr & bfmask16[sbit][todo]) >> (dbit - sbit));
    }

    if ((sbit -= 1) < 0) {
      sbit = 15;
      sptr--;
    }

    if ((dbit -= 1) < 0) {
      dbit = 15;
      dptr--;
    }

    width -= (todo + 1);
  }
}
#else
inline void FUNCTION(slow_bitline2left)(sptr, sbit, splanes, dptr, dbit, dplanes, width)
     register ushort *sptr;
     register long sbit;
     register long splanes;
     register ushort *dptr;
     register long dbit;
     register long dplanes;
     register long width;
{
  register long	todo;

  while (width) {

    if ((todo = MIN(sbit, dbit)) > (width - 1)) {
      todo = width - 1;
    }
    sbit -= todo;
    dbit -= todo;

    if (sbit > dbit) {
      /* must shift left */
      *dptr = (*dptr & ~bfmask16[dbit][todo]) | (*sptr & bfmask16[sbit][todo]) << (sbit - dbit);
    } else {
      /* must shift right */
      *dptr = (*dptr & ~bfmask16[dbit][todo]) | ((*sptr & bfmask16[sbit][todo]) >> (dbit - sbit));
    }

    if ((sbit -= 1) < 0) {
      sbit = 15;
      sptr -= splanes;
    }

    if ((dbit -= 1) < 0) {
      dbit = 15;
      dptr -= dplanes;
    }

    width -= (todo + 1);
  }
}
#endif


/*
 * finally the real bitblk function
 */

void FUNCTION(bitblk)(bm0, x0, y0, width, height, bm1, x1, y1)
     BITMAP *bm0;
     long x0;
     long y0;
     register long width;
     register long height;
     BITMAP *bm1;
     long x1;
     long y1;
{
  register ushort *sptr, *dptr;
  register long supl, dupl;
  long sbit, dbit;

  /* force gcc *not* to put these variables into registers - we need as much
   * registers as possible for the inline functions! TeSche 02/96
   */
  &bm0; &x0; &y0; &bm1; &x1; &y1; &sbit; &dbit;

  if (height <= 0 || width <= 0) {
    return;
  }
  if (CLIP_BITBLIT (x0, y0, width, height, x1, y1, clip0, clip1)) {
    return;
  }

  if (y1 >= y0) {
    y0 += (height - 1);
    y1 += (height - 1);
  }

  supl = bm0->upl;
  dupl = bm1->upl;

  if (x1 < x0) {

    sbit = x0 & 15;
    dbit = x1 & 15;
#ifdef MONOCHROME
    supl <<= 1;
    dupl <<= 1;
    sptr = (ushort *)bm0->data + y0 * supl + (x0 >> 4);
    dptr = (ushort *)bm1->data + y1 * dupl + (x1 >> 4);
#else
    sptr = (ushort *)bm0->data + y0 * supl + (x0 >> 4) * bm0->planes;
    dptr = (ushort *)bm1->data + y1 * dupl + (x1 >> 4) * bm1->planes;
#endif

    if (y1 >= y0) {
      supl = -supl;
      dupl = -dupl;
    }

    /*
     * The last line cannot be copied with the fast bitline,
     * because bitline2right reads data behind the buffer
     * which would cause memprot problems.
     */
    if (supl < 0) {
#ifdef MONOCHROME
      FUNCTION(slow_bitline2right)(sptr, sbit, dptr, dbit, width);
#else
      FUNCTION(slow_bitline2right)(sptr, sbit, bm0->planes, dptr, dbit, bm1->planes, width);
#endif
      sptr += supl;
      dptr += dupl;
    }
    while (--height) {
#ifdef MONOCHROME
      FUNCTION(bitline2right)(sptr, sbit, dptr, dbit, width);
#else
      FUNCTION(bitline2right)(sptr, sbit, bm0->planes, dptr, dbit, bm1->planes, width);
#endif
      sptr += supl;
      dptr += dupl;
    }
    if (supl > 0) {
#ifdef MONOCHROME
      FUNCTION(slow_bitline2right)(sptr, sbit, dptr, dbit, width);
#else
      FUNCTION(slow_bitline2right)(sptr, sbit, bm0->planes, dptr, dbit, bm1->planes, width);
#endif
    }

  } else {

    x0 += (width - 1);
    x1 += (width - 1);
    sbit = x0 & 15;
    dbit = x1 & 15;
#ifdef MONOCHROME
    supl <<= 1;
    dupl <<= 1;
    sptr = (ushort *)bm0->data + y0 * supl + (x0 >> 4);
    dptr = (ushort *)bm1->data + y1 * dupl + (x1 >> 4);
#else
    sptr = (ushort *)bm0->data + y0 * supl + (x0 >> 4) * bm0->planes;
    dptr = (ushort *)bm1->data + y1 * dupl + (x1 >> 4) * bm1->planes;
#endif

    if (y1 >= y0) {
      supl = -supl;
      dupl = -dupl;
    }

    /*
     * The first line cannot be copied with the fast bitline,
     * because bitline2left reads data before the buffer
     * which would cause memprot problems.
     */
    if (supl > 0) {
#ifdef MONOCHROME
      FUNCTION(slow_bitline2left)(sptr, sbit, dptr, dbit, width);
#else
      FUNCTION(slow_bitline2left)(sptr, sbit, bm0->planes, dptr, dbit, bm1->planes, width);
#endif
      sptr += supl;
      dptr += dupl;
    }
    while (--height) {
#ifdef MONOCHROME
      FUNCTION(bitline2left)(sptr, sbit, dptr, dbit, width);
#else
      FUNCTION(bitline2left)(sptr, sbit, bm0->planes, dptr, dbit, bm1->planes, width);
#endif
      sptr += supl;
      dptr += dupl;
    }
    if (supl < 0) {
#ifdef MONOCHROME
      FUNCTION(slow_bitline2left)(sptr, sbit, dptr, dbit, width);
#else
      FUNCTION(slow_bitline2left)(sptr, sbit, bm0->planes, dptr, dbit, bm1->planes, width);
#endif
    }
  }
}
