/* The file system maintains a buffer cache to reduce the number of disk
 * accesses needed.  Whenever a read or write to the disk is done, a check is
 * first made to see if the block is in the cache.  This file manages the
 * cache.
 *
 * The entry points into this file are:
 *   get_block:	    request to fetch a block for reading or writing from cache
 *   put_block:	    return a block previously requested with get_block
 *   buf_vrfy_ptr:  reuse a remembered buffer pointer if it still holds the block
 *   alloc_zone:    allocate a new zone (to increase the length of a file)
 *   free_zone:	    release a zone (when a file is removed)
 *   rw_scattered:  read or write a list of blocks from the disk itself
 *   invalidate:    remove all the cache blocks on some device
 *   flushall:	    write all dirty blocks of one device to disk
 *   buf_pool:      initialize the file system buffers
 *   sync_all_bufs: write all dirty buffers to disk
 */

#include "fs.h"
#include <minix/com.h>
#include "assert.h"
INIT_ASSERT
#include "buf.h"
#include "file.h"
#include "fproc.h"
#include "super.h"

/* Forward declarations of the file-local helpers defined further down. */
FORWARD _PROTOTYPE( void mergesort, (struct buf **al) );
FORWARD _PROTOTYPE( struct buf *alloc_buf, (void) );

/* Position in buf[] at which alloc_buf() resumes its circular scan for a
 * buffer to reuse; buf_pool() sets it to the start of buf[].
 */
PRIVATE struct buf *buf_next;

/*===========================================================================*
 *				get_block				     *
 *===========================================================================*/
PUBLIC struct buf *get_block(dev, block, how)
register dev_t dev;		/* on which device is the block? */
register block_t block;		/* which block is wanted? */
int how;			/* BF_NORMAL, BF_ALLOC, BF_LOOK */
{
/* Hand out the cache buffer for (dev, block) with its use count raised; the
 * caller gives it back with put_block().
 *
 * The buffer is looked up on the hash chain selected by the low bits of the
 * block number.  What happens next depends on 'how':
 *   BF_NORMAL: if no valid copy is cached, a buffer is taken over (or the
 *		invalid cached one is reused) and the block is read from the
 *		device, unless 'dev' is NO_DEV.
 *   BF_ALLOC:	the caller will overwrite the whole block, so a cached copy is
 *		returned even if it is not valid, and a freshly taken buffer
 *		is returned without reading the device.
 *   BF_LOOK:	only a valid cached copy is returned; otherwise the result is
 *		NULL and the cache is left untouched.
 * Buffers to take over are chosen by alloc_buf().
 */

  register struct buf *bp, *ptr;
  struct buf **link;		/* the chain link that points at 'ptr' */
  int slot;			/* index into buf_hash[] */

  /* Look for (dev, block) on its hash chain. */
  slot = (int) block & hash_mask;
  bp = buf_hash[slot];
  while (bp != NULL && !(bp->b_blocknr == block && bp->b_dev == dev))
	bp = bp->b_hash;

  /* A cached buffer can be handed out as it is when its contents are valid,
   * or when the caller does not care about the contents.
   */
  if (bp != NULL && (bp->b_valid || how == BF_ALLOC)) {
	if (bp->b_count == 0) bufs_in_use++;
	bp->b_count++;		/* one more user of this buffer */
	assert(bp->b_count>0);
	bp->b_usage++;
	return(bp);
  }

  /* A pure lookup ends here: nothing valid is cached. */
  if (how == BF_LOOK) return NULL;

  if (bp == NULL) {
	/* The block is not cached at all, take over some other buffer. */
	bp = alloc_buf();

	/* Unlink the victim from the hash chain of the block it held. */
	slot = (int) bp->b_blocknr & hash_mask;
	link = &buf_hash[slot];
	while ((ptr = *link) != NULL && ptr != bp) link = &ptr->b_hash;
	assert(ptr != NULL);
	*link = bp->b_hash;

	/* A dirty victim must reach the disk before it is reused.  Write out
	 * every dirty block of its device in one go, so that the next few
	 * victims of that device are already clean.
	 */
	if (bp->b_dev != NO_DEV && bp->b_dirt != CLEAN) flushall(bp->b_dev);

	/* Give the buffer its new identity and put it at the head of the
	 * hash chain it now belongs to.  Its contents are not valid yet.
	 */
	bp->b_dev = dev;
	bp->b_blocknr = block;
	bp->b_valid = 0;
	slot = (int) bp->b_blocknr & hash_mask;
	bp->b_hash = buf_hash[slot];
	buf_hash[slot] = bp;
  }

  if (bp->b_count == 0) bufs_in_use++;
  bp->b_count++;		/* one more user of this buffer */
  assert(bp->b_count>0);
  bp->b_usage++;

  /* Fetch the contents from the device, except for buffers that belong to
   * no device and for blocks that will be overwritten anyway.
   */
  if (dev != NO_DEV && how == BF_NORMAL) {
	bp->b_list = NULL;	/* a list of just this one buffer */
	rw_scattered(bp, READING);
  }
  return(bp);
}


/*===========================================================================*
 *				put_block				     *
 *===========================================================================*/
PUBLIC void put_block(bp)
register struct buf *bp;	/* pointer to the buffer to be released */
{
/* Give up one use of a buffer obtained earlier.  A NIL_BUF argument is
 * accepted and ignored, so callers need not check for it.  When the last
 * user lets go, the count of buffers in use drops, and a buffer without
 * valid contents has its usage reset so that alloc_buf() can take it over
 * right away.
 */

  if (bp != NIL_BUF) {
	assert(bp->b_count>0);
	bp->b_usage++;			/* releasing counts as a reference */
	if (--bp->b_count == 0) {
		bufs_in_use--;		/* nobody holds this buffer any more */
		if (!bp->b_valid) bp->b_usage = 0;
	}
  }
}


/*===========================================================================*
 *				alloc_buf				     *
 *===========================================================================*/
PRIVATE struct buf *alloc_buf()
{
/* Choose a buffer to be reused, with a second chance algorithm.  The scan
 * goes round buf[] starting at buf_next.  Buffers that are in use
 * (b_count != 0) are skipped.  Every other buffer has its usage shifted right
 * by BF_USAGE_SHIFT, and the first one whose usage reaches zero is returned.
 * If a full round finds none, all usages are scaled down far enough to make
 * the least used candidate reach zero, and a second round picks a victim.
 * If every buffer is in use there is nothing to pick, and the file system
 * panics.
 *
 * The buffer returned is still on its old hash chain and may be dirty; the
 * caller has to deal with that.
 */
	struct buf *ptr;
	unsigned min_usage;	/* lowest usage among the candidates seen */
	int candidates;		/* number of buffers seen that are not in use */
	int i, u;

#if DEBUG
	static unsigned long search_l, search_c;

	/* Report the average search length every 65536 calls. */
	if ((search_c & 65535) == 65535)
	{
		printf("alloc_buf: avg search length= %lu (%lu/%lu)\n",
			search_l/search_c, search_l, search_c);
		if (search_l >= ULONG_MAX/2)
		{
			/* Keep the counters from overflowing. */
			search_l /= 2;
			search_c /= 2;
		}
	}
#endif

	/* First round: age the candidates and take the first that hits 0. */
	min_usage= UINT_MAX;
	candidates= 0;
	for (i= 0; i<nr_bufs; i++)
	{
		ptr = buf_next;
		if (++buf_next == buf_limit)
			buf_next= buf;		/* wrap around */
		if (ptr->b_count != 0)
			continue;		/* buffer in use */
		candidates++;
		if ((ptr->b_usage >>= BF_USAGE_SHIFT) == 0)
		{
#if DEBUG
			search_l += i; search_c++;
#endif
			return ptr;		/* found a victim */
		}
		if (ptr->b_usage < min_usage)
			min_usage= ptr->b_usage;
	}
#if DEBUG
	search_l += i; search_c++;
#endif

	/* Only scale down and rescan when there is a candidate at all.
	 * Without one 'min_usage' is still UINT_MAX, the shift count below
	 * would equal the width of an unsigned, and such a shift is
	 * undefined.
	 */
	if (candidates != 0)
	{
		/* No candidate reached usage 0.  Artificially reduce the
		 * usage of all buffers by the number of significant bits in
		 * the lowest usage, so that the least used candidate drops
		 * to 0.
		 */
		for (u= 0; min_usage != 0; u++)
			min_usage >>= 1;

		for (i= 0; i<nr_bufs; i++)
			buf[i].b_usage >>= u;

		/* Second round: the first candidate with usage 0 wins. */
		for (i= 0; i<nr_bufs; i++)
		{
			ptr = buf_next;
			if (++buf_next == buf_limit)
				buf_next= buf;	/* wrap around */
			if (ptr->b_count != 0)
				continue;	/* buffer in use */
			if (ptr->b_usage == 0)
				return ptr;	/* found a victim */
		}
	}
	panic("alloc_buf: unable to allocate buffer", NO_NUM);
	return NULL;		/* not reached, panic() is not expected to return */
}


/*===========================================================================*
 *				buf_vrfy_ptr				     *
 *===========================================================================*/
PUBLIC struct buf *buf_vrfy_ptr(ptr, dev, block)
struct buf *ptr;		/* cache pointer */
register dev_t dev;		/* on which device is the block? */
register block_t block;		/* which block is wanted? */
{
	/* Shortcut for callers that remembered where a block lived in the
	 * cache.  If 'ptr' still holds a valid copy of (dev, block), it is
	 * marked in use and returned without a hash lookup.  In any other
	 * case, a NULL 'ptr' included, the block is obtained the normal way.
	 */

	if (ptr == NULL || ptr->b_dev != dev || ptr->b_blocknr != block
		|| !ptr->b_valid)
	{
		/* The buffer has been reused or invalidated meanwhile. */
		return get_block(dev, block, BF_NORMAL);
	}

	/* Same bookkeeping as for a cache hit in get_block(). */
	if (ptr->b_count == 0) bufs_in_use++;
	ptr->b_count++;
	assert(ptr->b_count > 0);
	ptr->b_usage++;
	return ptr;
}


/*===========================================================================*
 *				alloc_zone				     *
 *===========================================================================*/
PUBLIC zone_t alloc_zone(dev, z, err_p)
dev_t dev;			/* device where zone wanted */
zone_t z;			/* try to allocate new zone near this one */
int *err_p;			/* pointer to error variable */
{
/* Allocate a free zone on 'dev' and return its number.  'z' is a hint for
 * where to look; 0 means there is no hint.  If the device is full, a message
 * is printed, *err_p is set to ENOSPC and NO_ZONE is returned.  *err_p is
 * left alone on success.
 *
 * Zone numbers and bit numbers are related by
 *     z = b + sp->s_firstdatazone - 1
 * because alloc_bit() hands out bit 1 for the lowest data zone,
 * sp->s_firstdatazone, and keeps 0 free to serve as NO_BIT (failure).
 */

  struct super_block *sp;
  bit_t start, got;		/* where to start looking, and the result */
  int major, minor;

  sp = get_super(dev);		/* the super block of this device */

  assert(z == 0 || (z >= sp->s_firstdatazone && z < sp->s_zones));

  /* With a hint, start at the bit of that zone.  Without one, start at the
   * position remembered in the super block.
   */
  if (z != 0) {
	start = (bit_t) z - (sp->s_firstdatazone - 1);
  } else {
	start = sp->s_zsearch;
  }

  got = alloc_bit(sp, MP_ZONE, start);
  if (got != NO_BIT) {
	if (z == 0) sp->s_zsearch = got;	/* for next time */
	return(sp->s_firstdatazone - 1 + (zone_t) got);
  }

  /* The zone map has no free bit left. */
  major = (int) (sp->s_dev >> MAJOR) & BYTE;
  minor = (int) (sp->s_dev >> MINOR) & BYTE;
  printf("No space on %sdevice %d/%d\n",
	sp->s_dev == root_dev ? "root " : "", major, minor);
  *err_p = ENOSPC;
  return(NO_ZONE);
}


/*===========================================================================*
 *				free_zone				     *
 *===========================================================================*/
PUBLIC void free_zone(dev, numb)
dev_t dev;				/* device where zone located */
zone_t numb;				/* zone to be returned */
{
/* Return a zone. */

  register struct super_block *sp;
  bit_t bit;

  /* Locate the appropriate super_block and return bit. */
  sp = get_super(dev);
  if (numb < sp->s_firstdatazone || numb >= sp->s_zones) {
	printf("trying to free non-data zone %lu\n", numb);
	panic(NULL, NO_NUM);
  }
  bit = (bit_t) (numb - (sp->s_firstdatazone - 1));
  free_bit(sp, MP_ZONE, bit);
  if (!sp->s_nf && bit < sp->s_zsearch) sp->s_zsearch = bit;
}


/*===========================================================================*
 *				invalidate				     *
 *===========================================================================*/
PUBLIC void invalidate(device)
dev_t device;			/* device whose blocks are to be purged */
{
/* Disown all cached blocks of 'device'.  The blocks must have been written
 * out already; a dirty one trips the assertion.  A buffer is not taken off
 * its hash chain here.  It merely loses its device, so a lookup for a block
 * of 'device' no longer matches it, and its usage is reset so that
 * alloc_buf() will reuse it soon.
 */

  register struct buf *bp;

  bp = &buf[0];
  while (bp < buf_limit) {
	if (bp->b_dev == device) {
		assert(bp->b_dirt == CLEAN);
		bp->b_usage = 0;
		bp->b_dev = NO_DEV;
	}
	bp++;
  }
}


/*==========================================================================*
 *				flushall				    *
 *==========================================================================*/
PUBLIC void flushall(dev)
dev_t dev;			/* device to flush */
{
/* Write all dirty blocks of one device to the disk.  The dirty buffers are
 * strung together through b_list, sorted on block number and passed to
 * rw_scattered() as a single list.  Nothing is done if no block is dirty.
 */

  register struct buf *bp;
  struct buf *dirty;		/* head of the list of dirty buffers */
  struct super_block *sp;

  /* Gather the dirty buffers of 'dev', each new one in front of the list. */
  dirty = NULL;
  for (bp = &buf[0]; bp < buf_limit; bp++) {
	if (bp->b_dev != dev || bp->b_dirt == CLEAN) continue;
	assert(bp->b_valid);
	bp->b_list = dirty;
	dirty = bp;
  }
  if (dirty == NULL) return;

  /* Update the super block state of the device, if it has a super block. */
  sp = search_super(dev);
  if (sp != NULL) {
	super_clean(sp, FALSE);
	sp->s_sync_ct = 0;
  }

  /* Sort (needed only for two or more blocks) and write the list. */
  if (dirty->b_list != NULL) mergesort(&dirty);
  rw_scattered(dirty, WRITING);
}


/*===========================================================================*
 *				rw_scattered				     *
 *===========================================================================*/
PUBLIC void rw_scattered(bufq, rw_flag)
struct buf *bufq;		/* pointer to a list of buffers */
int rw_flag;			/* READING or WRITING */
{
/* Read or write a list of disk blocks.  The buffers are linked through
 * b_list and all belong to the device of the first buffer.  Blocks with
 * consecutive block numbers are transferred with a single request of at most
 * NR_IOREQS blocks.
 *
 * If the first block of a request cannot be transferred, the problem is
 * recorded in 'rdwt_err' (END_OF_FILE if the driver reported no error), an
 * I/O error is also printed, and the block is dropped from the list.  If the
 * error occurred while purging a block from the cache, it is impossible to
 * tell the process that once wrote the block.
 *
 * When reading, only one request is made: whatever follows the blocks that
 * were transferred is left unread.  When writing, requests are repeated until
 * the list is empty.
 */

  struct buf *bp, **pbp;
  iovec_t *iop;
  iovec_t iovec[NR_IOREQS];
  int j, r;

  /* Set up I/O vector and do I/O.  Dev_rw() returns the error code if there
   * was an error, but it is only usable if the first vector element is not
   * transferred.  All elements of the vector are updated to the new situation
   * (addresses increased, and sizes diminished by the number of bytes
   * transferred.)  The drivers treat scatter/gather I/O as optional for all
   * but the first segment, but to rw_scattered only reading is optional.
   */  
  while (bufq != NULL) {
	/* Make an I/O vector for the first few consecutive blocks: stop at
	 * NR_IOREQS entries, at the end of the list, or at the first block
	 * whose number does not follow on from the previous one.  'j' ends
	 * up as the number of vector elements.
	 */
	for (j = 0, iop = iovec, bp = bufq;
		j < NR_IOREQS && bp != NULL
				&& bp->b_blocknr == bufq->b_blocknr + j;
		j++, iop++, bp = bp->b_list)
	{
		iop->iov_addr = (vir_bytes) bp->b_data;
		iop->iov_size = BLOCK_SIZE;
	}

	/* Do the I/O, starting at the byte position of the first block. */
	r = dev_rw(rw_flag == WRITING ? DEV_SCATTER : DEV_GATHER,
		bufq->b_dev, FS_PROC_NR, mul64u(bufq->b_blocknr, BLOCK_SIZE),
		(char *) iovec, j, 0, 0);

	/* Harvest the results.  'pbp' points at the link to the buffer that
	 * goes with the current vector element.  A buffer that is done with
	 * is unlinked by overwriting that link; a buffer that stays on the
	 * list is stepped over by advancing 'pbp'.
	 */
	pbp= &bufq;
	for (iop = iovec; j > 0; j--, iop++) {
		bp = *pbp;
		bp->b_valid = 1;		/* validate block */

		if (iop->iov_size != 0) {
			/* An error, EOF, or driver not in the mood. */
			bp->b_valid = 0;	/* invalidate block */

			if (iop != iovec) {
				/* The problem is not with the first block, so
				 * we don't care now.  The block stays on the
				 * list with its dirty state untouched, so it
				 * is retried by the next request if writing.
				 */
				pbp = &bp->b_list;
				continue;
			}

			/* The first block failed: record why. */
			if (r == OK) {
				rdwt_err = END_OF_FILE;
			} else {
				printf(
				"fs: I/O error on device %d/%d, block %lu\n",
					(bp->b_dev>>MAJOR)&BYTE,
					(bp->b_dev>>MINOR)&BYTE,
					bp->b_blocknr);
				rdwt_err = r;
			}
			/* Let the faulty block be skipped. */
		}
		/* Transferred or given up on: mark it clean and take it off
		 * the list.
		 */
		bp->b_dirt = CLEAN;
		*pbp = bp->b_list;
	}
	if (rw_flag == READING) break;		/* reading is optional */
  }
}


/*===========================================================================*
 *				mergesort				     *
 *===========================================================================*/
PRIVATE void mergesort(struct buf **al)
{
/* Sort the list of buffers at *al, linked through b_list, on ascending block
 * number.  On return *al points to the first buffer of the sorted list.
 *
 * This is either a stable mergesort, or thermal noise.  (Found in a lisp
 * interpreter written by KJB around 86 when he was young and naive.  The
 * main problem is that it works and is therefore used by KJB everywhere.)
 * It must be called like this: if (L != nil && L->next != nil) mergesort(&L);
 * that is, only for lists of at least two elements.
 */

  /* static */ struct buf *l1, **mid;  /* Need not be local */
  struct buf *l2;

  /* Find the middle of the list: 'l1' runs ahead two elements for every
   * element that 'mid' advances.  'mid' is the address of the link that
   * leads to the second half.
   */
  l1= *(mid= &(*al)->b_list);
  do {
	if ((l1= l1->b_list) == NULL) break;
	mid= &(*mid)->b_list;
  } while ((l1= l1->b_list) != NULL);

  /* Cut the list in two: *al is the first half, 'l2' the second. */
  l2= *mid;
  *mid= NULL;

  /* Sort each half, unless it consists of a single element. */
  if ((*al)->b_list != NULL) mergesort(al);
  if (l2->b_list != NULL) mergesort(&l2);

  /* Merge the halves.  'al' is the address of the link where the next
   * element of the result goes; 'l1' and 'l2' are the elements of the two
   * halves that are yet to be merged.  On equal block numbers the element
   * of the first half goes first.
   */
  l1= *al;
  for (;;) {
	if (l1->b_blocknr <= l2->b_blocknr) {
		/* 'l1' is already linked in at *al, step past it. */
		if ((l1= *(al= &l1->b_list)) == NULL) {
			/* First half exhausted, append the rest of 'l2'. */
			*al= l2;
			break;
		}
	} else {
		/* Insert the head of 'l2' in front of 'l1'. */
		*al= l2;
		l2= *(al= &l2->b_list);
		*al= l1;
		if (l2 == NULL) break;	/* the rest of 'l1' is in place */
	}
  }
}


/*===========================================================================*
 *				buf_pool				     *
 *===========================================================================*/
PUBLIC void buf_pool()
{
/* Set up the buffer cache: no buffers in use, the victim scan starting at
 * the first buffer, all hash chains empty but the first, which holds every
 * buffer.
 *
 * One can't DMA to virtual memory directly, so a translation to physical
 * memory must be made with buffers breaking up on page boundaries, or double
 * buffering.  The device drivers can handle these problems, but they work
 * more efficiently if buffers are aligned to their size.
 */
  register struct buf *bufp;
  register union block *blk;	/* data area for the next buffer */
  phys_bytes address;		/* physical address of the data area */
  vir_bytes offset;		/* how far that address is off alignment */
  int h;

  bufs_in_use = 0;
  buf_next = buf;

  /* Align the data blocks to their size by stepping back from the second
   * block by the misalignment of the first.  This wastes one block.
   */
  address = sys_umap(FS_PROC_NR, SEG_D, (vir_bytes) cache, sizeof(cache));
  assert(address != 0);
  offset = address % sizeof(cache[0]);
  blk = (union block *) ((vir_bytes) &cache[1] - offset);

#if VIRT_MEM
  /* A virtual cache is not of much use, so lock the pages in core. */
  if (sys_vm_lock(FS_PROC_NR, (vir_bytes) blk, nr_bufs * sizeof(blk[0])) != OK)
	panic("can't lock buffer cache", NO_NUM);
#endif

  /* Start out with empty hash chains. */
  h = 0;
  while (h < nr_buf_hash) {
	buf_hash[h] = NIL_BUF;
	h++;
  }

  /* Clear each buffer header, give it a data block and no identity, and
   * push it onto the first hash chain.
   */
  bufp = &buf[0];
  while (bufp < buf_limit) {
	memset(bufp, '\0', sizeof(*bufp));
	bufp->b_dev = NO_DEV;
	bufp->b_blocknr = NO_BLOCK;
	bufp->b = blk;
	blk++;
	bufp->b_hash = buf_hash[0];
	buf_hash[0] = bufp;
	bufp++;
  }
}


/*===========================================================================*
 *				sync_all_bufs				     *
 *===========================================================================*/
PUBLIC void sync_all_bufs(dirt_level)
int dirt_level;			/* lowest b_dirt value that triggers a flush */
{
/* Write dirty blocks to the disk, one device at a time.  Each buffer that
 * belongs to a device and whose b_dirt is at least 'dirt_level' causes all
 * dirty blocks of its device to be written with flushall().
 */

  struct buf *bp;

  for (bp = &buf[0]; bp < buf_limit; bp++) {
	if (bp->b_dev != NO_DEV && bp->b_dirt >= dirt_level) {
		flushall(bp->b_dev);
	}
  }
}

/*
 * $PchId: cache.c,v 1.9 1996/02/29 23:09:24 philip Exp $
 */
