/*
 *   Copyright (c) International Business Machines  Corp., 2002-2003
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * linux/drivers/md/dm-bbr.c
 *
 * Bad-block-relocation (BBR) target for device-mapper.
 *
 * The BBR target is designed to remap I/O write failures to another safe
 * location on disk. Note that most disk drives have BBR built into them,
 * so this software BBR will only be activated once all of the hardware
 * BBR replacement sectors have been used.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/mempool.h>
#include "dm.h"
#include "dm-bbr.h"
#include "dm-daemon.h"
#include "dm-io.h"

/* Number of active BBR devices. Updated by bbr_ctr() and bbr_dtr() while
 * holding bbr_instances_lock; the first/last instance triggers the global
 * init/cleanup.
 */
static int bbr_instances = 0;
static DECLARE_MUTEX(bbr_instances_lock);

/* Data pertaining to the I/O thread: the dm-daemon handle, and the list of
 * queued bbr_io_buffer's (with the spinlock that protects it) which
 * bbr_io_handler() drains.
 */
static struct dm_daemon * bbr_io_thread = NULL;
static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(bbr_io_list);
static void bbr_io_handler(void);

/* Global pools for bbr_io_buf's and bbr_remap's. Each mempool is created on
 * top of the slab cache declared immediately before it (see create_pools()).
 */
static kmem_cache_t * bbr_io_buf_cache;
static mempool_t * bbr_io_buf_pool;
static kmem_cache_t * bbr_remap_cache;
static mempool_t * bbr_remap_pool;

/* Forward declaration: bbr_free_private() calls this before its definition. */
static void bbr_free_remap(struct bbr_private * bbr_id);

/**
 * destroy_pools
 *
 * Delete the pools for the remap list and I/O anchors.
 *
 * Each mempool is destroyed before the slab cache it was created from, and
 * every pointer is reset to NULL afterwards, so calling this function again
 * (or on a partially created set of pools) is harmless.
 **/
static void destroy_pools(void)
{
	/* I/O buffer mempool first, then its backing slab cache. */
	if (bbr_io_buf_pool) {
		mempool_destroy(bbr_io_buf_pool);
		bbr_io_buf_pool = NULL;
	}
	if (bbr_io_buf_cache) {
		kmem_cache_destroy(bbr_io_buf_cache);
		bbr_io_buf_cache = NULL;
	}
	/* Remap-node mempool, then its backing slab cache. */
	if (bbr_remap_pool) {
		mempool_destroy(bbr_remap_pool);
		bbr_remap_pool = NULL;
	}
	if (bbr_remap_cache) {
		kmem_cache_destroy(bbr_remap_cache);
		bbr_remap_cache = NULL;
	}
}

/**
 * create_pools
 *
 * Make sure the slab caches and mempools for remap-tree nodes and I/O
 * anchors exist, creating whichever of them is still missing.
 *
 * Returns 0 once all four objects exist. If any creation step fails, every
 * pool and cache is torn down again via destroy_pools() and -ENOMEM is
 * returned.
 **/
static int create_pools(void)
{
	/* Remap-node slab cache and the mempool layered on top of it. */
	if (!bbr_remap_cache) {
		bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
						    sizeof(struct bbr_runtime_remap),
						    0, SLAB_HWCACHE_ALIGN,
						    NULL, NULL);
		if (!bbr_remap_cache) {
			DMERR("Unable to create BBR remap cache.");
			goto fail;
		}
	}
	if (!bbr_remap_pool) {
		bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
						mempool_free_slab,
						bbr_remap_cache);
		if (!bbr_remap_pool) {
			DMERR("Unable to create BBR remap mempool.");
			goto fail;
		}
	}

	/* I/O buffer slab cache and the mempool layered on top of it. */
	if (!bbr_io_buf_cache) {
		bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
						     sizeof(struct bbr_io_buffer),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);
		if (!bbr_io_buf_cache) {
			DMERR("Unable to create BBR I/O buffer cache.");
			goto fail;
		}
	}
	if (!bbr_io_buf_pool) {
		bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
						 mempool_free_slab,
						 bbr_io_buf_cache);
		if (!bbr_io_buf_pool) {
			DMERR("Unable to create BBR I/O buffer mempool.");
			goto fail;
		}
	}

	/* Reaching this point means all four objects are in place. */
	return 0;

fail:
	destroy_pools();
	return -ENOMEM;
}

/**
 * stop_io_thread
 *
 * Stop the BBR I/O thread through the dm-daemon services and release its
 * handle. Does nothing when no handle is currently allocated.
 **/
static void stop_io_thread(void)
{
	if (!bbr_io_thread) {
		return;
	}

	dm_daemon_stop(bbr_io_thread);
	kfree(bbr_io_thread);
	bbr_io_thread = NULL;
}

/**
 * stop_io_thread
 *
 * Use the dm-daemon services to start the BBR I/O thread.
 **/
static int start_io_thread(void)
{
	int rc;

	if (!bbr_io_thread) {
		bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
		if (!bbr_io_thread) {
			return -ENOMEM;
		}

		rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
		if (rc) {
			kfree(bbr_io_thread);
			return rc;
		}
	}

	return 0;
}

/**
 * bbr_global_init
 *
 * Bring up the shared resources in order: the mempools, the I/O thread and
 * the sync-I/O service (dm_io_get). Whatever was already set up is undone
 * again if a later step fails. Intended to be called only when the first
 * bbr device is created.
 *
 * Returns 0 on success, or the error code of the step that failed.
 **/
static int bbr_global_init(void)
{
	int rc = create_pools();

	if (rc) {
		return rc;
	}

	rc = start_io_thread();
	if (rc) {
		destroy_pools();
		return rc;
	}

	rc = dm_io_get(1);
	if (rc) {
		destroy_pools();
		stop_io_thread();
	}

	return rc;
}

/**
 * bbr_global_cleanup
 *
 * Cleanup the mempools, I/O thread and sync-I/O service. This should
 * be called only when the last bbr device is removed.
 *
 * Releases everything bbr_global_init() acquired: the pools and caches,
 * the I/O thread handle, and the reference taken with dm_io_get(1).
 **/
static void bbr_global_cleanup(void)
{
	destroy_pools();
	stop_io_thread();
	dm_io_put(1);
}

static struct bbr_private * bbr_alloc_private(void)
{
	struct bbr_private * bbr_id;

	bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
	if (bbr_id) {
		memset(bbr_id, 0, sizeof(*bbr_id));
		bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
		bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
	}
	
	return bbr_id;
}

/**
 * bbr_free_private
 *
 * Release everything owned by a bbr_private: the in-memory copy of the BBR
 * table (if one was allocated), the remap tree, and finally the structure
 * itself. bbr_id must not be NULL.
 **/
static void bbr_free_private(struct bbr_private * bbr_id)
{
	if (bbr_id->bbr_table) {
		kfree(bbr_id->bbr_table);
	}
	/* Returns the remap-tree nodes to bbr_remap_pool. */
	bbr_free_remap(bbr_id);
	kfree(bbr_id);
}

/* Lookup table used by calculate_crc(), filled in lazily on first use. */
static u32 crc_table[256];
static u32 crc_table_built = 0;

/**
 * build_crc_table
 *
 * Compute the 256-entry CRC lookup table: each entry is its index pushed
 * through eight right-shift steps, XOR-ing in CRC_POLYNOMIAL whenever the
 * bit shifted out is set. Marks the table as built when done.
 **/
static void build_crc_table(void)
{
	u32 byte, bit, value;

	for (byte = 0; byte < 256; byte++) {
		value = byte;
		for (bit = 0; bit < 8; bit++) {
			if (value & 1) {
				value = (value >> 1) ^ CRC_POLYNOMIAL;
			} else {
				value = value >> 1;
			}
		}
		crc_table[byte] = value;
	}

	crc_table_built = 1;
}

/**
 * calculate_crc
 * @crc:	Starting CRC value (callers in this file pass INITIAL_CRC).
 * @buffer:	Data to run the CRC over.
 * @buffersize:	Number of bytes in @buffer.
 *
 * Table-driven CRC over @buffer, one byte at a time. The lookup table is
 * built on first use. Returns the updated CRC value.
 **/
static u32 calculate_crc(u32 crc, void * buffer, u32 buffersize)
{
	unsigned char * data = (unsigned char *) buffer;
	u32 pos;

	/* Make sure the crc table is available */
	if (!crc_table_built)
		build_crc_table();

	/* Fold each byte of the buffer into the running CRC. */
	for (pos = 0; pos < buffersize; pos++) {
		crc = ((crc >> 8) & 0x00FFFFFF) ^
		      crc_table[(crc ^ (u32) data[pos]) & (u32) 0xff];
	}

	return crc;
}

/**
 * le_bbr_table_sector_to_cpu
 *
 * Convert bbr meta data from on-disk (LE) format
 * to the native cpu endian format.
 **/
static void le_bbr_table_sector_to_cpu(struct bbr_table * p)
{
	int i;
	p->signature		= le32_to_cpup(&p->signature);
	p->crc			= le32_to_cpup(&p->crc);
	p->sequence_number	= le32_to_cpup(&p->sequence_number);
	p->in_use_cnt		= le32_to_cpup(&p->in_use_cnt);
	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
		p->entries[i].bad_sect =
			le64_to_cpup(&p->entries[i].bad_sect);
		p->entries[i].replacement_sect =
			le64_to_cpup(&p->entries[i].replacement_sect);
	}
}

/**
 * cpu_bbr_table_sector_to_le
 *
 * Convert bbr meta data from cpu endian format to on-disk (LE) format
 **/
static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
				       struct bbr_table * le)
{
	int i;
	le->signature		= cpu_to_le32p(&p->signature);
	le->crc			= cpu_to_le32p(&p->crc);
	le->sequence_number	= cpu_to_le32p(&p->sequence_number);
	le->in_use_cnt		= cpu_to_le32p(&p->in_use_cnt);
	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
		le->entries[i].bad_sect =
			cpu_to_le64p(&p->entries[i].bad_sect);
		le->entries[i].replacement_sect =
			cpu_to_le64p(&p->entries[i].replacement_sect);
	}
}

/**
 * validate_bbr_table_sector
 *
 * Check the specified BBR table sector for a valid signature and CRC. If it's
 * valid, endian-convert the table sector.
 **/
static int validate_bbr_table_sector(struct bbr_table * p)
{
	int rc = 0;
	int org_crc, final_crc;

	if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
		DMERR("BBR table signature doesn't match!");
		DMERR("Found 0x%x. Expecting 0x%x",
		      le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
		rc = -EINVAL;
		goto out;
	}

	if (!p->crc) {
		DMERR("BBR table sector has no CRC!");
		rc = -EINVAL;
		goto out;
	}

	org_crc = le32_to_cpup(&p->crc);
	p->crc = 0;
	final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
	if (final_crc != org_crc) {
		DMERR("CRC failed!");
		DMERR("Found 0x%x. Expecting 0x%x",
		      org_crc, final_crc);
		rc = -EINVAL;
		goto out;
	}

	p->crc = cpu_to_le32p(&org_crc);
	le_bbr_table_sector_to_cpu(p);

out:
	return rc;
}

/**
 * bbr_binary_tree_insert
 * @root:	Address of the tree's root pointer.
 * @newnode:	Node to link in; its child pointers are reset here.
 *
 * Insert a node into the (unbalanced) binary tree, keyed on
 * remap.bad_sect. Keys greater than the current node go to the right,
 * everything else (including equal keys) goes to the left.
 **/
static void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
				   struct bbr_runtime_remap * newnode)
{
	struct bbr_runtime_remap ** slot;

	newnode->left = newnode->right = NULL;

	/* Walk down until we reach the empty link where newnode belongs. */
	for (slot = root; slot && *slot; ) {
		if (newnode->remap.bad_sect > (*slot)->remap.bad_sect) {
			slot = &((*slot)->right);
		} else {
			slot = &((*slot)->left);
		}
	}

	*slot = newnode;
}

/**
 * bbr_binary_search
 * @root:	Root of the remap tree (may be NULL).
 * @lsn:	Sector number to look up.
 *
 * Search for a node that contains bad_sect == lsn. Returns that node, or
 * NULL if the tree holds no entry for @lsn.
 **/
static struct bbr_runtime_remap * bbr_binary_search(
	struct bbr_runtime_remap * root,
	u64 lsn)
{
	struct bbr_runtime_remap * cur = root;

	while (cur) {
		if (cur->remap.bad_sect == lsn) {
			return cur;
		}
		cur = (lsn > cur->remap.bad_sect) ? cur->right : cur->left;
	}

	return NULL;
}

/**
 * bbr_binary_tree_destroy
 * @root:	Root of the remap tree to tear down (may be NULL).
 * @bbr_id:	Private data for the BBR node (not used by this function).
 *
 * Destroy the binary tree without recursion: repeatedly walk down from the
 * root to a leaf, hand that leaf back to bbr_remap_pool, clear the parent's
 * link to it, and start again from the root. The loop ends once the root
 * itself has been freed. The caller is responsible for resetting its own
 * root pointer afterwards.
 **/
static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
				    struct bbr_private * bbr_id)
{
	/* Address of the parent's pointer to the node currently visited. */
	struct bbr_runtime_remap ** link = NULL;
	struct bbr_runtime_remap * node = root;

	while (node) {
		/* Descend, preferring the left child, until a leaf is found. */
		if (node->left) {
			link = &(node->left);
			node = node->left;
			continue;
		}
		if (node->right) {
			link = &(node->right);
			node = node->right;
			continue;
		}

		/* node is a leaf: release it. */
		mempool_free(node, bbr_remap_pool);
		if (node == root) {
			/* If root is deleted, we're done. */
			break;
		}

		/* Back to root. The freed leaf is unlinked from its parent
		 * (which is still allocated) so the next walk skips it.
		 */
		node = root;
		*link = NULL;
	}
}

/**
 * bbr_free_remap
 *
 * Free every node of this device's remap tree and reset the root pointer,
 * all while holding bbr_id_lock with interrupts disabled.
 **/
static void bbr_free_remap(struct bbr_private * bbr_id)
{
	spin_lock_irq(&bbr_id->bbr_id_lock);   
	bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
	bbr_id->remap_root = NULL;
	spin_unlock_irq(&bbr_id->bbr_id_lock);
}

/**
 * bbr_insert_remap_entry
 * @bbr_id:		Private data for the BBR node.
 * @new_bbr_entry:	Bad/replacement sector pair to record.
 *
 * Allocate a runtime remap node from bbr_remap_pool, copy the sector pair
 * into it and link it into this device's remap tree under bbr_id_lock.
 *
 * Returns 0 on success or -ENOMEM if no node could be allocated.
 **/
static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
				  struct bbr_table_entry * new_bbr_entry)
{
	struct bbr_runtime_remap * node;

	node = mempool_alloc(bbr_remap_pool, GFP_NOIO);
	if (!node) {
		DMERR("Could not allocate from remap mempool!");
		return -ENOMEM;
	}

	node->remap.bad_sect = new_bbr_entry->bad_sect;
	node->remap.replacement_sect = new_bbr_entry->replacement_sect;

	spin_lock_irq(&bbr_id->bbr_id_lock);
	bbr_binary_tree_insert(&bbr_id->remap_root, node);
	spin_unlock_irq(&bbr_id->bbr_id_lock);

	return 0;
}

/**
 * bbr_table_to_remap_list
 *
 * The on-disk bbr table is sorted by the replacement sector LBA. In order
 * to improve run time performance, the in-memory remap tree is keyed on the
 * bad sector LBA instead. This function is called at discovery time to
 * populate that tree from the already validated table, and assumes that at
 * least one copy of the meta data is valid.
 *
 * Table sectors are scanned in order until the first one whose in_use_cnt
 * is zero. Returns the total number of in-use entries found. Failures of
 * bbr_insert_remap_entry() are not reported to the caller.
 **/
static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
{
	struct bbr_table * sect;
	u32 in_use_blks = 0;
	int i, j;

	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++) {
		sect = &bbr_id->bbr_table[i];
		if (!sect->in_use_cnt) {
			break;
		}

		in_use_blks += sect->in_use_cnt;
		for (j = 0; j < sect->in_use_cnt; j++) {
			bbr_insert_remap_entry(bbr_id, &sect->entries[j]);
		}
	}

	if (in_use_blks)
		DMWARN("There are %u BBR entries for device %u:%u",
		       in_use_blks, MAJOR(bbr_id->dev->dev),
		       MINOR(bbr_id->dev->dev));

	return in_use_blks;
}

/**
 * bbr_search_remap_entry
 * @bbr_id:	Private data for the BBR node.
 * @lsn:	Sector number to look up.
 *
 * Look @lsn up in the remap tree while holding bbr_id_lock. Returns a
 * pointer to the matching table entry, or NULL when the sector has not
 * been remapped.
 **/
static struct bbr_table_entry * bbr_search_remap_entry(
	struct bbr_private * bbr_id,
	u64 lsn)
{
	struct bbr_runtime_remap * found;

	spin_lock_irq(&bbr_id->bbr_id_lock);
	found = bbr_binary_search(bbr_id->remap_root, lsn);
	spin_unlock_irq(&bbr_id->bbr_id_lock);

	return found ? &found->remap : NULL;
}

/**
 * bbr_remap
 * @bbr_id:	Private data for the BBR node.
 * @lsn:	In: sector to check. Out: replacement sector, if remapped.
 *
 * If *lsn is in the remap table, replace it with its replacement sector
 * and return 1. Otherwise leave *lsn untouched and return 0. The tree is
 * only searched when at least one replacement block is in use.
 **/
static inline int bbr_remap(struct bbr_private * bbr_id,
			    u64 * lsn)
{
	struct bbr_table_entry * entry;

	if (!atomic_read(&bbr_id->in_use_replacement_blks)) {
		return 0;
	}

	entry = bbr_search_remap_entry(bbr_id, *lsn);
	if (!entry) {
		return 0;
	}

	*lsn = entry->replacement_sect;
	return 1;
}

/**
 * bbr_remap_probe
 * @bbr_id:	Private data for the BBR node.
 * @lsn:	First sector of the range.
 * @nr_sects:	Number of sectors in the range.
 *
 * Probe the range [lsn, lsn + nr_sects) in steps of blksize_in_sects
 * sectors. Return 1 as soon as one of the probed sectors is found in the
 * remap table, otherwise return 0.
 **/
static inline int bbr_remap_probe(struct bbr_private * bbr_id,
				  u64 lsn, u64 nr_sects)
{
	u64 probe, done;

	if (!atomic_read(&bbr_id->in_use_replacement_blks)) {
		return 0;
	}

	for (done = 0; done < nr_sects; done += bbr_id->blksize_in_sects) {
		/* bbr_remap() may overwrite its argument, so use a copy. */
		probe = lsn + done;
		if (bbr_remap(bbr_id, &probe)) {
			return 1;
		}
	}

	return 0;
}

/**
 * bbr_setup
 *
 * Read the remap table from disk, one sector at a time, into
 * bbr_id->bbr_table and validate each sector. A sector that cannot be read
 * from the first table copy is retried from the second copy when one is
 * configured. Once every sector is valid, build the remap tree and record
 * the number of replacement blocks in use.
 *
 * Returns 0 on success, or the first read/validation error encountered.
 **/
static int bbr_setup(struct bbr_private * bbr_id)
{
	struct bbr_table * sect;
	struct io_region job;
	struct page * page;
	unsigned int error, offset;
	int i, rc = 0;

	job.dev = bbr_id->dev->dev;
	job.count = 1;

	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++) {
		sect = bbr_id->bbr_table + i;
		page = virt_to_page(sect);
		offset = (unsigned long)sect & ~PAGE_MASK;

		/* Try the primary copy, then fall back to the secondary. */
		job.sector = bbr_id->lba_table1 + i;
		rc = dm_io_sync(1, &job, READ, page, offset, &error);
		if (rc && bbr_id->lba_table2) {
			job.sector = bbr_id->lba_table2 + i;
			rc = dm_io_sync(1, &job, READ, page, offset, &error);
		}

		if (!rc) {
			rc = validate_bbr_table_sector(sect);
		}
		if (rc) {
			break;
		}
	}

	if (rc) {
		DMERR("dm-bbr: error during device setup: %d", rc);
		return rc;
	}

	atomic_set(&bbr_id->in_use_replacement_blks,
		   bbr_table_to_remap_list(bbr_id));
	return 0;
}

static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
						  struct buffer_head * bh,
						  int rw)
{
	struct bbr_io_buffer * bbr_io_buf;

	bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
	if (bbr_io_buf) {
		memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
		INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
		bbr_io_buf->bbr_id = bbr_id;
		bbr_io_buf->sector = bh->b_rsector;
		bbr_io_buf->bh = bh;
		bbr_io_buf->rw = rw;
	} else {
		DMWARN("Could not allocate from BBR I/O buffer pool!");
	}
	return bbr_io_buf;
}

/**
 * free_bbr_io_buf
 *
 * Return an I/O anchor obtained from allocate_bbr_io_buf() to
 * bbr_io_buf_pool.
 **/
static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
{
	mempool_free(bbr_io_buf, bbr_io_buf_pool);
}

/**
 * bbr_io_remap_error
 * @bbr_id:		Private data for the BBR node.
 * @rw:			READ or WRITE.
 * @starting_lsn:	Starting sector of request to remap.
 * @count:		Number of sectors in the request.
 * @buffer:		Data buffer for the request.
 *
 * For the requested range, try to write each sector individually. For each
 * sector that fails, find the next available remap location and write the
 * data to that new location. Then update the table and write both copies
 * of the table to disk. Finally, update the in-memory mapping and do any
 * other necessary bookkeeping.
 *
 * Returns 0 when every sector was written (directly or to a replacement
 * sector). Returns -EIO for READ requests and when the replacement sectors
 * are exhausted; otherwise the error from writing the table to disk or from
 * inserting the new remap-tree entry.
 *
 * NOTE(review): the table CRC below is calculated on the cpu-endian table
 * before it is converted to LE, while validate_bbr_table_sector() checks
 * the CRC against the on-disk (LE) bytes. These look equivalent only on
 * little-endian hosts -- confirm.
 *
 * NOTE(review): a replacement sector that turns out to be bad is skipped
 * by bumping in_use_replacement_blks without bumping the table sector's
 * in_use_cnt, and entries are stored at the replacement-slot index. Since
 * bbr_table_to_remap_list() loads entries[0 .. in_use_cnt), a skipped slot
 * presumably leaves a hole that shifts what gets loaded -- confirm.
 **/
static int bbr_io_remap_error(struct bbr_private * bbr_id,
			      int rw,
			      u64 starting_lsn,
			      u64 count,
			      char * buffer)
{
	struct bbr_table * bbr_table;
	struct io_region job;
	struct page * page;
	unsigned long table_sector_index;
	unsigned long table_sector_offset;
	unsigned long index;
	unsigned int offset_in_page, error;
	u64 lsn, new_lsn;
	int rc;

	if (rw == READ) {
		/* Nothing can be done about read errors. */
		return -EIO;
	}

	job.dev = bbr_id->dev->dev;

	/* For each sector in the request. */
	for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
		/* First retry this single sector at its original location. */
		job.sector = starting_lsn + lsn;
		job.count = 1;
		page = virt_to_page(buffer);
		offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
		rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
		/* Loop until the data has landed on a good replacement
		 * sector and the remap has been recorded (rc == 0), or we
		 * bail out with an error.
		 */
		while (rc) {
			/* Find the next available relocation sector. */
			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
			if (new_lsn >= bbr_id->nr_replacement_blks) {
				/* No more replacement sectors available. */
				return -EIO;
			}
			new_lsn += bbr_id->start_replacement_sect;

			/* Write the data to its new location. */
			DMWARN("dm-bbr: device %u:%u: Trying to remap bad sector "PFU64" to sector "PFU64,
			       MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev),
			       starting_lsn + lsn, new_lsn);
			job.sector = new_lsn;
			rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
			if (rc) {
				/* This replacement sector is bad.
				 * Try the next one.
				 */
				DMERR("dm-bbr: device %u:%u: replacement sector "PFU64" is bad. Skipping.",
				      MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), new_lsn);
				atomic_inc(&bbr_id->in_use_replacement_blks);
				continue;
			}

			/* Add this new entry to the on-disk table. The slot
			 * is derived from the replacement sector's position:
			 * which table sector, and which entry within it.
			 */
			table_sector_index = new_lsn -
					     bbr_id->start_replacement_sect;
			table_sector_offset = table_sector_index /
					      BBR_ENTRIES_PER_SECT;
			index = table_sector_index % BBR_ENTRIES_PER_SECT;

			bbr_table = &bbr_id->bbr_table[table_sector_offset];
			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
			bbr_table->entries[index].replacement_sect = new_lsn;
			bbr_table->in_use_cnt++;
			bbr_table->sequence_number++;
			/* The CRC is calculated with the crc field zeroed. */
			bbr_table->crc = 0;
			bbr_table->crc = calculate_crc(INITIAL_CRC,
						       bbr_table,
						       sizeof(struct bbr_table));

			/* Write the table to disk. The sector is converted to
			 * LE in place for the I/O and converted back below.
			 * page/offset_in_page now describe the table sector
			 * rather than the data buffer; rc is 0 at this point.
			 */
			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
			page = virt_to_page(bbr_table);
			offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
			if (bbr_id->lba_table1) {
				job.sector = bbr_id->lba_table1 + table_sector_offset;
				job.count = 1;
				rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
			}
			if (bbr_id->lba_table2) {
				job.sector = bbr_id->lba_table2 + table_sector_offset;
				rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
			}
			le_bbr_table_sector_to_cpu(bbr_table);

			if (rc) {
				/* Error writing one of the tables to disk. */
				DMERR("dm-bbr: device %u:%u: error updating BBR tables on disk.",
				      MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
				return rc;
			}

			/* Insert a new entry in the remapping binary-tree. */
			rc = bbr_insert_remap_entry(bbr_id,
						    &bbr_table->entries[index]);
			if (rc) {
				DMERR("dm-bbr: device %u:%u: error adding new entry to remap tree.",
				      MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
				return rc;
			}

			/* The replacement slot is now consumed; rc is 0, so
			 * the while loop ends and the next sector is tried.
			 */
			atomic_inc(&bbr_id->in_use_replacement_blks);
		}
	}

	return 0;
}

/**
 * bbr_io_process_request
 * @bbr_io_buf:	I/O anchor describing the request (device, starting
 *		sector, buffer head and direction).
 *
 * For each sector in this request, check if the sector has already
 * been remapped. If so, process all previous sectors in the request,
 * followed by the remapped sector. Then reset the starting lsn and
 * count, and keep going with the rest of the request as if it were
 * a whole new request. If any of the sync_io's return an error,
 * call the remapper to relocate the bad sector(s).
 *
 * Returns 0 if the whole request was processed, otherwise the error from
 * the I/O to an already-remapped sector or from bbr_io_remap_error().
 **/
static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
{
	struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
	struct io_region job;
	u64 starting_lsn = bbr_io_buf->sector;
	u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
	u64 lsn, remapped_lsn;
	char * buffer = bbr_io_buf->bh->b_data;
	struct page * page = virt_to_page(buffer);
	unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
	unsigned int error;
	int rw = bbr_io_buf->rw;
	int rc = 0;

	job.dev = bbr_id->dev->dev;

	/* For each sector in this request, check if this sector has already
	 * been remapped. If so, process all previous sectors in this request,
	 * followed by the remapped sector. Then reset the starting lsn and
	 * count and keep going with the rest of the request as if it were
	 * a whole new request.
	 */
	for (lsn = 0; lsn < count; lsn++) {
		/* lsn is the offset from starting_lsn; bbr_remap() replaces
		 * remapped_lsn with the replacement sector on a hit.
		 */
		remapped_lsn = starting_lsn + lsn;
		rc = bbr_remap(bbr_id, &remapped_lsn);
		if (!rc) {
			/* This sector is fine. */
			continue;
		}

		/* Process all sectors in the request up to this one. */
		if (lsn > 0) {
			job.sector = starting_lsn;
			job.count = lsn;
			rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
			if (rc) {
				/* If this I/O failed, then one of the sectors
				 * in this request needs to be relocated.
				 */
				rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
							lsn, buffer);
				if (rc) {
					return rc;
				}
			}
			/* Advance the data pointer past the sectors just
			 * handled, so it points at the remapped sector's data.
			 */
			buffer += (lsn << SECTOR_SHIFT);
			page = virt_to_page(buffer);
			offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
		}

		/* Process the remapped sector. */
		job.sector = remapped_lsn;
		job.count = 1;
		rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
		if (rc) {
			/* BUGBUG - Need more processing if this caused an
			 * an error. If this I/O failed, then the existing
			 * remap is now bad, and we need to find a new remap.
			 * Can't use bbr_io_remap_error(), because the existing
			 * map entry needs to be changed, not added again, and
			 * the original table entry also needs to be changed.
			 */
			return rc;
		}

		/* Restart the scan on the remainder of the request. lsn is
		 * unsigned, so -1 wraps and the loop's lsn++ brings it back
		 * to 0.
		 */
		buffer		+= SECTOR_SIZE;
		starting_lsn	+= (lsn + 1);
		count		-= (lsn + 1);
		lsn		= -1;
		page		= virt_to_page(buffer);
		offset_in_page	= (unsigned long)buffer & ~PAGE_MASK;
	}

	/* Check for any remaining sectors after the last split. This could
	 * potentially be the whole request, but that should be a rare case
	 * because requests should only be processed by the thread if we know
	 * an error occurred or they contained one or more remapped sectors.
	 */
	if (count) {
		job.sector = starting_lsn;
		job.count = count;
		rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
		if (rc) {
			/* If this I/O failed, then one of the sectors in this
			 * request needs to be relocated.
			 */
			rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
						count, buffer);
			if (rc) {
				return rc;
			}
		}
	}

	return 0;
}

/**
 * bbr_io_handler
 *
 * This is the handler for the bbr_io_thread. It continuously loops,
 * taking I/O requests off its list and processing them. If nothing
 * is on the list, the thread goes back to sleep until specifically
 * woken up.
 *
 * I/O requests should only be sent to this thread if we know that:
 * a) the request contains at least one remapped sector.
 *   or
 * b) the request caused an error on the normal I/O path.
 * This function uses synchronous I/O, so sending a request to this
 * thread that doesn't need special processing will cause severe
 * performance degredation.
 **/
static void bbr_io_handler(void)
{
	struct bbr_io_buffer * bbr_io_buf;
	struct buffer_head * bh;
	unsigned long flags;
	int rc;

	while (1) {
		/* Process bbr_io_list, one entry at a time. */
		spin_lock_irqsave(&bbr_io_list_lock, flags);
		if (list_empty(&bbr_io_list)) {
			/* No more items on the list. */
			spin_unlock_irqrestore(&bbr_io_list_lock, flags);
			break;
		}
		bbr_io_buf = list_entry(bbr_io_list.next,
					struct bbr_io_buffer, bbr_io_list);
		list_del_init(&bbr_io_buf->bbr_io_list);
		spin_unlock_irqrestore(&bbr_io_list_lock, flags);

		rc = bbr_io_process_request(bbr_io_buf);

		/* Clean up and complete the original I/O. */
		bbr_io_buf->flags |= BBR_IO_HANDLED;
		bh = bbr_io_buf->bh;
		if (bh->b_end_io) {
			/* If this was the bbr_io_buf for an error on the
			 * normal WRITE, don't free it here. It will be
			 * freed later in bbr_callback()
			 */
			if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
				free_bbr_io_buf(bbr_io_buf);
			bh->b_end_io(bh, rc ? 0 : 1);
		}
	}
}

/**
 * bbr_schedule_io
 *
 * Place the specified bbr_io_buf on the thread's processing list.
 *
 * The anchor is appended to the tail of bbr_io_list under
 * bbr_io_list_lock, and the I/O thread is then woken up to process it.
 **/
static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
{
	unsigned long flags;
	spin_lock_irqsave(&bbr_io_list_lock, flags);
	list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
	spin_unlock_irqrestore(&bbr_io_list_lock, flags);
	dm_daemon_wake(bbr_io_thread);
}

/**
 * bbr_read
 * @bbr_id:	Private data for the BBR node.
 * @bh:		Buffer head of the read request.
 *
 * If the request touches any remapped sector, hand it to the I/O thread
 * for processing. Otherwise redirect it to the underlying device so it
 * goes down the stack normally.
 *
 * Returns 1 when the request was redirected to the underlying device, 0
 * when it was queued for the thread, or -ENOMEM (after completing @bh with
 * an error) if no I/O anchor could be allocated.
 **/
static int bbr_read(struct bbr_private * bbr_id,
		    struct buffer_head * bh)
{
	struct bbr_io_buffer * bbr_io_buf;

	if (atomic_read(&bbr_id->in_use_replacement_blks) != 0 &&
	    bbr_remap_probe(bbr_id, bh->b_rsector,
			    bh->b_size >> SECTOR_SHIFT)) {
		/* This request has at least one remapped sector. */
		bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
		if (!bbr_io_buf) {
			/* Can't get memory to track the I/O. */
			bh->b_end_io(bh, 0);
			return -ENOMEM;
		}

		bbr_schedule_io(bbr_io_buf);
		return 0;
	}

	/* No existing remaps or this request doesn't
	 * contain any remapped sectors.
	 */
	bh->b_rdev = bbr_id->dev->dev;
	return 1;
}

/**
 * bbr_callback
 *
 * This is the callback for normal write requests. Check for an error
 * during the I/O, and send to the thread for processing if necessary.
 **/
static int bbr_callback(struct dm_target * ti,
			struct buffer_head * bh,
			int rw,
			int error,
			union map_info * map_context)
{
	struct bbr_io_buffer * bbr_io_buf = (struct bbr_io_buffer *) map_context->ptr;

	if (!bbr_io_buf)
		return error;

	/* Will try to relocate the WRITE if:
	 * - It is an error, and
	 * - It is not an error of BBR relocation, and
	 */
	if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
		DMERR("dm-bbr: device %u:%u: Write failure on sector %lu. Scheduling for retry.",
		      MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
		      (unsigned long)bbr_io_buf->sector);
		/* Indicate this bbr_io_buf is for an error on normal WRITE */
		bbr_io_buf->flags |= BBR_IO_RELOCATE;
		bbr_schedule_io(bbr_io_buf);
		/* Returns >0 so that DM will let us retry the I/O */
		return 1;
	}

	free_bbr_io_buf(bbr_io_buf);
	return error;
}

/**
 * bbr_write
 * @bbr_id:		Private data for the BBR node.
 * @bh:			Buffer head of the write request.
 * @map_context:	Receives the I/O anchor for use by bbr_callback().
 *
 * An I/O anchor is always allocated for a write. If the request touches a
 * remapped sector it is handed to the I/O thread (and the map context is
 * cleared). Otherwise the anchor is stored in the map context so that
 * bbr_callback() can react to a write failure, and the request is
 * redirected to the underlying device.
 *
 * Returns 1 when the request was redirected to the underlying device, 0
 * when it was queued for the thread, or -ENOMEM (after completing @bh with
 * an error) if no I/O anchor could be allocated.
 **/
static int bbr_write(struct bbr_private * bbr_id,
		     struct buffer_head * bh,
		     union map_info * map_context)
{
	struct bbr_io_buffer * bbr_io_buf;

	bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
	if (!bbr_io_buf) {
		/* Can't get memory to track the I/O. */
		bh->b_end_io(bh, 0);
		return -ENOMEM;
	}

	if (atomic_read(&bbr_id->in_use_replacement_blks) != 0 &&
	    bbr_remap_probe(bbr_id, bh->b_rsector,
			    bh->b_size >> SECTOR_SHIFT)) {
		/* This request contains at least one remapped sector. */
		map_context->ptr = NULL;
		bbr_schedule_io(bbr_io_buf);
		return 0;
	}

	/* No existing remaps or this request
	 * contains no remapped sectors.
	 */
	bh->b_rdev = bbr_id->dev->dev;
	map_context->ptr = bbr_io_buf;
	return 1;
}

/**
 * Construct a bbr mapping
 **/
static int bbr_ctr(struct dm_target * ti, unsigned int argc, char ** argv)
{
	struct bbr_private * bbr_id;
	u32 block_size;
	char * end;
	int rc = -EINVAL;

	if (argc != 8) {
		ti->error = "dm-bbr requires exactly 8 arguments: "
			    "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
		goto out1;
	}

	bbr_id = bbr_alloc_private();
	if (!bbr_id) {
		ti->error = "dm-bbr: Error allocating bbr private data.";
		goto out1;
	}

	bbr_id->offset = simple_strtoull(argv[1], &end, 10);
	bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
	bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
	bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
	bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
	bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
	block_size = simple_strtoul(argv[7], &end, 10);
	bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);

	bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
				    GFP_KERNEL);
	if (!bbr_id->bbr_table) {
		ti->error = "dm-bbr: Error allocating bbr table.";
		goto out2;
	}

	if (dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &bbr_id->dev)) {
		ti->error = "dm-bbr: Device lookup failed";
		goto out2;
	}

	/* Using a semaphore here is probably overkill,
	 * but at least it will be correct.
	 */
	down(&bbr_instances_lock);
	if (bbr_instances == 0) {
		rc = bbr_global_init();
		if (rc) {
			up(&bbr_instances_lock);
			goto out3;
		}
	}
	bbr_instances++;
	up(&bbr_instances_lock);

	rc = bbr_setup(bbr_id);
	if (rc) {
		ti->error = "dm-bbr: Device setup failed";
		goto out4;
	}

	ti->private = bbr_id;
	return 0;

out4:
	down(&bbr_instances_lock);
	bbr_instances--;
	if (bbr_instances == 0) {
		bbr_global_cleanup();
	}
	up(&bbr_instances_lock);

out3:
	dm_put_device(ti, bbr_id->dev);
out2:
	bbr_free_private(bbr_id);
out1:
	return rc;
}

/**
 * bbr_dtr
 *
 * Destroy a bbr mapping: release the underlying device and the private
 * data, then drop this instance from the global count. When the last
 * instance goes away, the shared pools, I/O thread and sync-I/O service
 * are cleaned up as well.
 **/
static void bbr_dtr(struct dm_target * ti)
{
	struct bbr_private * bbr_id = (struct bbr_private *) ti->private;

	dm_put_device(ti, bbr_id->dev);
	/* Frees the remap-tree nodes back into bbr_remap_pool, so this has
	 * to happen before the pools can be destroyed below.
	 */
	bbr_free_private(bbr_id);

	down(&bbr_instances_lock);
	bbr_instances--;
	if (bbr_instances == 0) {
		bbr_global_cleanup();
	}
	up(&bbr_instances_lock);
}

/**
 * bbr_map
 * @ti:			Target the request is mapped through.
 * @bh:			Buffer head of the request.
 * @rw:			READ, READA or WRITE.
 * @map_context:	Per-request context shared with bbr_callback().
 *
 * Shift the request by the target's offset and dispatch it to the read or
 * write path. Reads carry no anchor, so their map context is cleared.
 *
 * Returns the value of bbr_read() / bbr_write(), or -EIO for any other
 * request type.
 **/
static int bbr_map(struct dm_target * ti, struct buffer_head * bh, int rw,
		   union map_info * map_context)
{
	struct bbr_private * bbr_id = (struct bbr_private *) ti->private;

	bh->b_rsector += bbr_id->offset;

	if (rw == READ || rw == READA) {
		map_context->ptr = NULL;
		return bbr_read(bbr_id, bh);
	}

	if (rw == WRITE) {
		return bbr_write(bbr_id, bh, map_context);
	}

	return -EIO;
}

/**
 * bbr_status
 * @ti:		Target to report on.
 * @type:	Kind of status requested.
 * @result:	Output buffer.
 * @maxlen:	Size of @result.
 *
 * For STATUSTYPE_INFO the result is an empty string. For STATUSTYPE_TABLE
 * the result is the target's parameters in the same order the constructor
 * takes them. Any other type leaves @result untouched. Always returns 0.
 **/
static int bbr_status(struct dm_target * ti, status_type_t type,
		      char * result, unsigned int maxlen)
{
	struct bbr_private * bbr_id = (struct bbr_private *) ti->private;

	if (type == STATUSTYPE_INFO) {
		result[0] = '\0';
	} else if (type == STATUSTYPE_TABLE) {
		snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
			 dm_kdevname(bbr_id->dev->dev), bbr_id->offset,
			 bbr_id->lba_table1, bbr_id->lba_table2,
			 bbr_id->nr_sects_bbr_table,
			 bbr_id->start_replacement_sect,
			 bbr_id->nr_replacement_blks,
			 bbr_id->blksize_in_sects << SECTOR_SHIFT);
	}

	return 0;
}

/* Device-mapper target descriptor: binds the "bbr" target name to the
 * constructor, destructor, map, end_io and status handlers in this file.
 */
static struct target_type bbr_target = {
	name:	"bbr",
	module:	THIS_MODULE,
	ctr:	bbr_ctr,
	dtr:	bbr_dtr,
	map:	bbr_map,
	end_io:	bbr_callback,
	status:	bbr_status,
};

/**
 * dm_bbr_init
 *
 * Module entry point: register the bbr target with device-mapper.
 * Returns the result of dm_register_target(); a negative value is logged
 * as a registration failure.
 **/
int __init dm_bbr_init(void)
{
	int r = dm_register_target(&bbr_target);

	if (r < 0)
		DMERR("dm-bbr: register failed %d", r);

	return r;
}

/**
 * dm_bbr_exit
 *
 * Module exit point: unregister the bbr target from device-mapper. A
 * failure is logged; there is nothing else to undo here.
 **/
void __exit dm_bbr_exit(void)
{
	int r = dm_unregister_target(&bbr_target);

	if (r < 0)
		DMERR("dm-bbr: unregister failed %d", r);
}

/* Hook up the module entry/exit points and declare the license. */
module_init(dm_bbr_init);
module_exit(dm_bbr_exit);
MODULE_LICENSE("GPL");
